thr3ads.net - Ocfs2 devel - [Ocfs2-devel] [PATCH 00/39] ocfs2: Add reflink file support. V3 [Apr 2009]

If this information is useful, please help other people find it:
Share via:

Tao Ma

2009-Apr-29 22:58 UTC

[Ocfs2-devel] [PATCH 01/39] ocfs2: Define refcount tree structure.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/ocfs2_fs.h |   85 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e..8d47d74 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
 #define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
 #define OCFS2_DX_ROOT_SIGNATURE		"DXDIR01"
 #define OCFS2_DX_LEAF_SIGNATURE		"DXLEAF1"
+#define OCFS2_REFCOUNT_BLOCK_SIGNATURE	"REFCNT1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -160,6 +161,9 @@
 /* Metadata checksum and error correction */
 #define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
 
+/* Refcount tree support */
+#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE	0x1000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -223,6 +227,7 @@
 #define OCFS2_HAS_XATTR_FL	(0x0002)
 #define OCFS2_INLINE_XATTR_FL	(0x0004)
 #define OCFS2_INDEXED_DIR_FL	(0x0008)
+#define OCFS2_HAS_REFCOUNT_FL   (0x0010)
 
 /* Inode attributes, keep in sync with EXT2 */
 #define OCFS2_SECRM_FL		(0x00000001)	/* Secure deletion */
@@ -241,8 +246,11 @@
 /*
  * Extent record flags (e_node.leaf.flags)
  */
-#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
-					 * unwritten */
+#define OCFS2_EXT_UNWRITTEN		(0x01)	/* Extent is allocated but
+						 * unwritten */
+#define OCFS2_EXT_REFCOUNTED		(0x02)  /* Extent is reference
+						 * counted in an associated
+						 * refcount tree */
 
 /*
  * ioctl commands
@@ -717,7 +725,8 @@ struct ocfs2_dinode {
 	__le64 i_xattr_loc;
 /*80*/	struct ocfs2_block_check i_check;	/* Error checking */
 /*88*/	__le64 i_dx_root;		/* Pointer to dir index root block */
-	__le64 i_reserved2[5];
+/*90*/	__le64 i_refcount_loc;
+	__le64 i_reserved2[4];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -901,6 +910,56 @@ struct ocfs2_group_desc
 /*40*/	__u8    bg_bitmap[0];
 };
 
+struct ocfs2_refcount_rec {
+/*00*/  __le64 r_cpos;		/* Physical offset, in clusters */
+	__le32 r_clusters;	/* Clusters covered by this extent */
+	__le32 r_refcount;	/* Reference count of this extent */
+/*10*/
+};
+
+#define OCFS2_REFCOUNT_LEAF_FL          (0x00000001)
+#define OCFS2_REFCOUNT_TREE_FL          (0x00000002)
+
+struct ocfs2_refcount_list {
+/*00*/  __le16 rl_count;	/* Maximum number of entries possible
+				   in rl_records */
+	__le16 rl_used;		/* Current number of used records */
+	__le32 rl_reserved2;
+	__le64 rl_reserved1;	/* Pad to sizeof(ocfs2_refcount_record) */
+/*10*/	struct ocfs2_refcount_rec rl_recs[0];	/* Refcount records */
+};
+
+
+struct ocfs2_refcount_block {
+/*00*/	__u8 rf_signature[8];		/* Signature for verification */
+	__le16 rf_suballoc_slot;	/* Slot suballocator this block
+					   belongs to */
+	__le16 rf_suballoc_bit;		/* Bit offset in suballocator
+					   block group */
+	__le32 rf_fs_generation;	/* Must match superblock */
+/*10*/	__le64 rf_blkno;		/* Offset on disk, in blocks */
+	__le64 rf_parent;		/* Parent block, only valid if
+					   OCFS2_REFCOUNT_LEAF_FL is set in
+					   rf_flags */
+/*20*/	struct ocfs2_block_check rf_check;	/* Error checking */
+	__le64 rf_last_eb_blk;		/* Pointer to last extent block */
+/*30*/	__le32 rf_count;		/* Number of inodes sharing this
+					   refcount tree */
+	__le32 rf_flags;		/* See the flags above */
+	__le32 rf_clusters;		/* clusters covered by refcount tree. */
+	__le32 rf_cpos;			/* cluster offset in refcount tree.*/
+/*40*/	__le64 rf_reserved1[8];
+/*80*/	union {
+		struct ocfs2_refcount_list rf_records;  /* List of refcount
+							  records */
+		struct ocfs2_extent_list rf_list;	/* Extent record list,
+							only valid if
+							OCFS2_REFCOUNT_TREE_FL
+							is set in rf_flags */
+	};
+/* Actual on-disk size is one block */
+};
+
 /*
  * On disk extended attribute structure for OCFS2.
  */
@@ -1312,6 +1371,26 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct
super_block *sb)
 
 	return size / sizeof(struct ocfs2_extent_rec);
 }
+
+static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
+static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
+
+	return size / sizeof(struct ocfs2_refcount_rec);
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 02/39] ocfs2: Add metaecc for ocfs2_refcount_block.

Add metaecc and journal trigger for ocfs2_refcount_block.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/journal.c |   15 +++++++++++++++
 fs/ocfs2/journal.h |    3 +++
 2 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 6ae353c..02c5a21 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -551,6 +551,14 @@ static struct ocfs2_triggers eb_triggers = {
 	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
 };
 
+static struct ocfs2_triggers rb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
 static struct ocfs2_triggers gd_triggers = {
 	.ot_triggers = {
 		.t_commit = ocfs2_commit_trigger,
@@ -673,6 +681,13 @@ int ocfs2_journal_access_eb(handle_t *handle, struct
ocfs2_caching_info *ci,
 	return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
 }
 
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
+				      type);
+}
+
 int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index ea22691..08196e0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -276,6 +276,9 @@ int ocfs2_journal_access_di(handle_t *handle, struct
ocfs2_caching_info *ci,
 /* ocfs2_extent_block */
 int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
 /* ocfs2_group_desc */
 int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 03/39] ocfs2: Add ocfs2_read_refcount_block.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/Makefile          |    1 +
 fs/ocfs2/cluster/masklog.c |    1 +
 fs/ocfs2/cluster/masklog.h |    1 +
 fs/ocfs2/ocfs2.h           |    3 +
 fs/ocfs2/refcounttree.c    |   99 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 105 insertions(+), 0 deletions(-)
 create mode 100644 fs/ocfs2/refcounttree.c

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 0159607..31f25ce 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
 	locks.o			\
 	mmap.o 			\
 	namei.o 		\
+	refcounttree.o		\
 	resize.o		\
 	slot_map.o 		\
 	suballoc.o 		\
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df541..1cd2934 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(EXPORT),
 	define_mask(XATTR),
 	define_mask(QUOTA),
+	define_mask(REFCOUNT),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81..d233ec2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
+#define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 320c51b..f02b939 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -570,6 +570,9 @@ static inline int ocfs2_uses_extended_slot_map(struct
ocfs2_super *osb)
 #define OCFS2_IS_VALID_DX_LEAF(ptr)					\
 	(!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
 
+#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr)				\
+	(!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 0000000..9790032
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,99 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.c
+ *
+ * Copyright (C) 2004, 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define MLOG_MASK_PREFIX ML_REFCOUNT
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "suballoc.h"
+#include "journal.h"
+#include "uptodate.h"
+#include "super.h"
+#include "buffer_head_io.h"
+#include "blockcheck.h"
+
+static int ocfs2_validate_refcount_block(struct super_block *sb,
+					 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_refcount_block *rb +		(struct ocfs2_refcount_block
*)bh->b_data;
+
+	mlog(0, "Validating refcount block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return rc;
+	}
+
+
+	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    rb->rf_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid rf_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Refcount block #%llu has an invalid "
+			    "rf_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(rb->rf_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
+				     u64 rb_blkno,
+				     struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
+			      ocfs2_validate_refcount_block);
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 04/39] ocfs2: Basic tree root operation.

Add basic refcount tree root operation.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/journal.h      |    3 +
 fs/ocfs2/refcounttree.c |  273 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |   26 +++++
 3 files changed, 302 insertions(+), 0 deletions(-)
 create mode 100644 fs/ocfs2/refcounttree.h

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 08196e0..bfd5942 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -482,6 +482,9 @@ static inline int ocfs2_calc_dxi_expand_credits(struct
super_block *sb)
 	return credits;
 }
 
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+					    + OCFS2_SUBALLOC_ALLOC + 1)
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list.
Otherwise
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9790032..d158936 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -26,6 +26,9 @@
 #include "super.h"
 #include "buffer_head_io.h"
 #include "blockcheck.h"
+#include "refcounttree.h"
+#include "sysfile.h"
+#include "dlmglue.h"
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -97,3 +100,273 @@ static int ocfs2_read_refcount_block(struct
ocfs2_caching_info *ci,
 
 	return rc;
 }
+
+/*
+ * Create a refcount tree for an inode.
+ * We take for granted that the inode is already locked.
+ */
+int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+	BUG_ON(di->i_refcount_loc);
+
+	mlog(0, "create tree for inode %lu\n", inode->i_ino);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(rb, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
+	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
+	rb->rf_blkno = cpu_to_le64(first_blkno);
+	rb->rf_count = cpu_to_le32(1);
+	rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	rb->rf_records.rl_count +		
cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
+
+	ocfs2_journal_dirty(handle, new_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(first_blkno);
+	spin_unlock(&oi->ip_lock);
+
+	mlog(0, "created tree for inode %lu, refblock %llu\n",
+	     inode->i_ino, (unsigned long long)first_blkno);
+
+	ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	brelse(new_bh);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+int ocfs2_set_refcount_tree(struct inode *inode,
+			    struct buffer_head *di_bh,
+			    u64 refcount_loc)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_block *rb;
+
+	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
+	BUG_ON(di->i_refcount_loc);
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					refcount_loc, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	le32_add_cpu(&rb->rf_count, 1);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = cpu_to_le64(refcount_loc);
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	brelse(ref_root_bh);
+
+	return ret;
+}
+
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct inode *alloc_inode = NULL;
+	struct buffer_head *alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	int credits = OCFS2_INODE_UPDATE_CREDITS + 1;
+	u64 blk = 0, bg_blkno = 0;
+	u16 bit = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&blk_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
+
+	/*
+	 * If we are the last user, we need to free the block.
+	 * So lock the allocator ahead.
+	 */
+	if (le32_to_cpu(rb->rf_count) == 1) {
+		blk = le64_to_cpu(rb->rf_blkno);
+		bit = le16_to_cpu(rb->rf_suballoc_bit);
+		bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+		alloc_inode = ocfs2_get_system_file_inode(osb,
+					EXTENT_ALLOC_SYSTEM_INODE,
+					le16_to_cpu(rb->rf_suballoc_slot));
+		if (!alloc_inode) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+		mutex_lock(&alloc_inode->i_mutex);
+
+		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_mutex;
+		}
+
+		credits += OCFS2_SUBALLOC_FREE;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), blk_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	di->i_refcount_loc = 0;
+	spin_unlock(&oi->ip_lock);
+	ocfs2_journal_dirty(handle, di_bh);
+
+	le32_add_cpu(&rb->rf_count , -1);
+	ocfs2_journal_dirty(handle, blk_bh);
+
+	if (!rb->rf_count) {
+		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
+					       alloc_bh, bit, bg_blkno, 1);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	if (alloc_inode) {
+		ocfs2_inode_unlock(alloc_inode, 1);
+		brelse(alloc_bh);
+	}
+out_mutex:
+	if (alloc_inode) {
+		mutex_unlock(&alloc_inode->i_mutex);
+		iput(alloc_inode);
+	}
+out:
+	brelse(blk_bh);
+
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 0000000..9f4bdac
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,26 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * refcounttree.h
+ *
+ * Copyright (C) 2004, 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef OCFS2_REFCOUNTTREE_H
+#define OCFS2_REFCOUNTTREE_H
+
+int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_set_refcount_tree(struct inode *inode,
+			    struct buffer_head *di_bh,
+			    u64 blkno);
+int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+
+#endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 05/39] ocfs2: hook remove refcount tree into truncate.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 6f20f30..0c96b27 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -51,6 +51,7 @@
 #include "xattr.h"
 
 #include "buffer_head_io.h"
+#include "refcounttree.h"
 
 
 /*
@@ -7305,6 +7306,10 @@ start:
 	goto start;
 
 bail:
+	if (!status && OCFS2_I(inode)->ip_clusters == 0) {
+		/* remove the refcount tree. */
+		status = ocfs2_remove_refcount_tree(inode, fe_bh);
+	}
 
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 06/39] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree.

Add a new operation eo_ocfs2_extent_contig int extent tree's
operation. So that with the new refcount tree, we can calculate
whether they are contiguous in its own function.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |   57 ++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0c96b27..1fc56f3 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -53,7 +53,17 @@
 #include "buffer_head_io.h"
 #include "refcounttree.h"
 
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT,
+	CONTIG_LEFTRIGHT,
+};
 
+static enum ocfs2_contig_type
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec);
 /*
  * Operations for a specific extent tree type.
  *
@@ -123,6 +133,16 @@ struct ocfs2_extent_tree_operations {
 	 * to 0 (unlimited).  Optional.
 	 */
 	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_ocfs2_extent_contig test whether the 2 ocfs2_extent_rec
+	 * are contiguous or not. Optional. Don't need to set it if use
+	 * ocfs2_extent_rec as the tree leaf.
+	 */
+	enum ocfs2_contig_type
+		(*eo_ocfs2_extent_contig)(struct super_block *sb,
+					  struct ocfs2_extent_rec *ext,
+					  struct ocfs2_extent_rec *insert_rec);
 };
 
 
@@ -386,6 +406,9 @@ static void __ocfs2_init_extent_tree(struct
ocfs2_extent_tree *et,
 		et->et_max_leaf_clusters = 0;
 	else
 		et->et_ops->eo_fill_max_leaf_clusters(et);
+
+	if (!et->et_ops->eo_ocfs2_extent_contig)
+		et->et_ops->eo_ocfs2_extent_contig = ocfs2_extent_rec_contig;
 }
 
 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
@@ -459,6 +482,16 @@ static inline int ocfs2_et_root_journal_access(handle_t
*handle,
 					  type);
 }
 
+static inline enum ocfs2_contig_type
+	ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *ext,
+			       struct ocfs2_extent_rec *insert_rec)
+{
+	return et->et_ops->eo_ocfs2_extent_contig(
+			ocfs2_metadata_cache_get_super(et->et_ci),
+			ext, insert_rec);
+}
+
 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
 					struct ocfs2_extent_rec *rec)
 {
@@ -731,17 +764,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
u32 v_cluster)
 	return ret;
 }
 
-enum ocfs2_contig_type {
-	CONTIG_NONE = 0,
-	CONTIG_LEFT,
-	CONTIG_RIGHT,
-	CONTIG_LEFTRIGHT,
-};
-
-
 /*
  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
- * ocfs2_extent_contig only work properly against leaf nodes!
+ * ocfs2_extent_rec_contig only work properly against leaf nodes!
  */
 static int ocfs2_block_extent_contig(struct super_block *sb,
 				     struct ocfs2_extent_rec *ext,
@@ -767,9 +792,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec
*left,
 }
 
 static enum ocfs2_contig_type
-	ocfs2_extent_contig(struct super_block *sb,
-			    struct ocfs2_extent_rec *ext,
-			    struct ocfs2_extent_rec *insert_rec)
+	ocfs2_extent_rec_contig(struct super_block *sb,
+				struct ocfs2_extent_rec *ext,
+				struct ocfs2_extent_rec *insert_rec)
 {
 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 
@@ -4294,7 +4319,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree
*et,
 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
 				ret = CONTIG_RIGHT;
 		} else {
-			ret = ocfs2_extent_contig(sb, rec, split_rec);
+			ret = ocfs2_et_extent_contig(et, rec, split_rec);
 		}
 	}
 
@@ -4339,7 +4364,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree
*et,
 	if (rec) {
 		enum ocfs2_contig_type contig_type;
 
-		contig_type = ocfs2_extent_contig(sb, rec, split_rec);
+		contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
 
 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
 			ret = CONTIG_LEFTRIGHT;
@@ -4367,8 +4392,8 @@ static void ocfs2_figure_contig_type(struct
ocfs2_extent_tree *et,
 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		contig_type =
ocfs2_extent_contig(ocfs2_metadata_cache_get_super(et->et_ci),
-						  &el->l_recs[i], insert_rec);
+		contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
+						     insert_rec);
 		if (contig_type != CONTIG_NONE) {
 			insert->ins_contig_index = i;
 			break;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 07/39] ocfs2: Abstract extent split process.

ocfs2_mark_extent_written actually does the following things:
1. check the parameters.
2. initialize the left_path and split_rec.
3. call __ocfs2_mark_extent_written. it will do:
   1) check the flags of unwritten
   2) do the real split work.
The whole process is packed tightly somehow. So this patch
will abstract 2 different functions so that future b-tree
operation can work with it.

1. __ocfs2_split_extent will accept path and split_rec and do
  the real split work.
2. ocfs2_change_extent_flag will accept a new flag and initialize
   path and split_rec.

So now ocfs2_mark_extent_written will do:
1. check the parameters.
2. call ocfs2_change_extent_flag.
   1) initalize the left_path and split_rec.
   2) check whether the new flags conflict with the old one.
   3) call __ocfs2_split_extent to do the split.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |  142 ++++++++++++++++++++++++++++++++----------------------
 1 files changed, 85 insertions(+), 57 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 1fc56f3..b06db13 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4926,10 +4926,9 @@ out:
 }
 
 /*
- * Mark part or all of the extent record at split_index in the leaf
- * pointed to by path as written. This removes the unwritten
- * extent flag.
- *
+ * Split part or all of the extent record at split_index in the leaf
+ * pointed to by path. Merge with the contiguous extent record if needed.
+
  * Care is taken to handle contiguousness so as to not grow the tree.
  *
  * meta_ac is not strictly necessary - we only truly need it if growth
@@ -4945,13 +4944,13 @@ out:
  * have been brought into cache (and pinned via the journal), so the
  * extra overhead is not expressed in terms of disk reads.
  */
-static int __ocfs2_mark_extent_written(handle_t *handle,
-				       struct ocfs2_extent_tree *et,
-				       struct ocfs2_path *path,
-				       int split_index,
-				       struct ocfs2_extent_rec *split_rec,
-				       struct ocfs2_alloc_context *meta_ac,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int __ocfs2_split_extent(handle_t *handle,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_path *path,
+				int split_index,
+				struct ocfs2_extent_rec *split_rec,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -4960,12 +4959,6 @@ static int __ocfs2_mark_extent_written(handle_t *handle,
 	struct ocfs2_merge_ctxt ctxt;
 	struct ocfs2_extent_list *rightmost_el;
 
-	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
-		ret = -EIO;
-		mlog_errno(ret);
-		goto out;
-	}
-
 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
 	     (le32_to_cpu(split_rec->e_cpos) +
le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5034,43 +5027,20 @@ out:
 	return ret;
 }
 
-/*
- * Mark the already-existing extent at cpos as written for len clusters.
- *
- * If the existing extent is larger than the request, initiate a
- * split. An attempt will be made at merging with adjacent extents.
- *
- * The caller is responsible for passing down meta_ac if we'll need it.
- */
-int ocfs2_mark_extent_written(struct inode *inode,
-			      struct ocfs2_extent_tree *et,
-			      handle_t *handle, u32 cpos, u32 len, u32 phys,
-			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int ocfs2_change_extent_flag(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 len, u32 phys,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc,
+				    int new_flags, int clear_flags)
 {
 	int ret, index;
-	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
-
-	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
-	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
-
-	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
-		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
-			    "that are being written to, but the feature bit "
-			    "is not set in the super block.",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		ret = -EROFS;
-		goto out;
-	}
-
-	/*
-	 * XXX: This should be fixed up so that we just re-insert the
-	 * next extent records.
-	 */
-	ocfs2_et_extent_map_truncate(et, 0);
+	struct ocfs2_extent_rec *rec;
 
 	left_path = ocfs2_new_path_from_et(et);
 	if (!left_path) {
@@ -5088,30 +5058,88 @@ int ocfs2_mark_extent_written(struct inode *inode,
 
 	index = ocfs2_search_extent_list(el, cpos);
 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has an extent at cpos %u which can no "
+		ocfs2_error(sb,
+			    "Owner %llu has an extent at cpos %u which can no "
 			    "longer be found.\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+			     (unsigned long long)
+			     ocfs2_metadata_cache_owner(et->et_ci), cpos);
 		ret = -EROFS;
 		goto out;
 	}
 
+	rec = &el->l_recs[index];
+	if ((new_flags && (rec->e_flags & new_flags)) ||
+	    (clear_flags && !(rec->e_flags & clear_flags))) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
 	split_rec.e_cpos = cpu_to_le32(cpos);
 	split_rec.e_leaf_clusters = cpu_to_le16(len);
 	split_rec.e_blkno = cpu_to_le64(start_blkno);
-	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
-	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
+	split_rec.e_flags = rec->e_flags;
+	if (new_flags)
+		split_rec.e_flags |= new_flags;
+	else
+		split_rec.e_flags &= ~clear_flags;
 
-	ret = __ocfs2_mark_extent_written(handle, et, left_path,
-					  index, &split_rec, meta_ac,
-					  dealloc);
+	ret = __ocfs2_split_extent(handle, et, left_path,
+				  index, &split_rec, meta_ac,
+				  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
 out:
 	ocfs2_free_path(left_path);
 	return ret;
+
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ * This removes the unwritten extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
+			      handle_t *handle, u32 cpos, u32 len, u32 phys,
+			      struct ocfs2_alloc_context *meta_ac,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
+	     inode->i_ino, cpos, len, phys);
+
+	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+			    "that are being written to, but the feature bit "
+			    "is not set in the super block.",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/*
+	 * XXX: This should be fixed up so that we just re-insert the
+	 * next extent records.
+	 */
+	ocfs2_et_extent_map_truncate(et, 0);
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       0, OCFS2_EXT_UNWRITTEN);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
 }
 
 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 08/39] ocfs2: Add refcount b-tree as a new extent tree.

Add refcount b-tree as a new extent tree so that it can
use the b-tree to store and maniuplate ocfs2_refcount_rec.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |   54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |    3 +++
 2 files changed, 57 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b06db13..115c054 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -386,6 +386,52 @@ static struct ocfs2_extent_tree_operations
ocfs2_dx_root_et_ops = {
 	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
 };
 
+static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	et->et_root_el = &rb->rf_list;
+}
+
+static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+						u64 blkno)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	rb->rf_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	return le64_to_cpu(rb->rf_last_eb_blk);
+}
+
+static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
+						u32 clusters)
+{
+	struct ocfs2_refcount_block *rb = et->et_object;
+
+	le32_add_cpu(&rb->rf_clusters, clusters);
+}
+
+static enum ocfs2_contig_type
+ocfs2_refcount_tree_extent_contig(struct super_block *sb,
+				  struct ocfs2_extent_rec *ext,
+				  struct ocfs2_extent_rec *insert_rec)
+{
+	return CONTIG_NONE;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_refcount_tree_fill_root_el,
+	.eo_ocfs2_extent_contig = ocfs2_refcount_tree_extent_contig,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *bh,
@@ -443,6 +489,14 @@ void ocfs2_init_dx_root_extent_tree(struct
ocfs2_extent_tree *et,
 				 NULL, &ocfs2_dx_root_et_ops);
 }
 
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
+				 NULL, &ocfs2_refcount_tree_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					    u64 new_last_eb_blk)
 {
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index bcf6aa4..df0e778 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -80,6 +80,9 @@ void ocfs2_init_xattr_value_extent_tree(struct
ocfs2_extent_tree *et,
 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
 				    struct ocfs2_caching_info *ci,
 				    struct buffer_head *bh);
+void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh.  If *bh is NULL, a bh will be
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 09/39] ocfs2: export tree operation functions.

Now fs/ocfs2/alloc.c has more than 7000 lines. It contains our
basic b-tree operation. Although we have already make our b-tree
operation generic, the basic structrue ocfs2_path which is used
to iterate one b-tree branch is still static and limited to only
used in alloc.c. As refcount tree need them and I don't want to
add any more b-tree unrelated code to alloc.c, export them out.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |   74 ++++++++++++++++-------------------------------------
 fs/ocfs2/alloc.h |   49 +++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 51 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 115c054..f8c41d6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -568,40 +568,12 @@ static inline int ocfs2_et_sanity_check(struct
ocfs2_extent_tree *et)
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt
*ctxt,
 					 struct ocfs2_extent_block *eb);
-
-/*
- * Structures which describe a path through a btree, and functions to
- * manipulate them.
- *
- * The idea here is to be as generic as possible with the tree
- * manipulation code.
- */
-struct ocfs2_path_item {
-	struct buffer_head		*bh;
-	struct ocfs2_extent_list	*el;
-};
-
-#define OCFS2_MAX_PATH_DEPTH	5
-
-struct ocfs2_path {
-	int				p_tree_depth;
-	ocfs2_journal_access_func	p_root_access;
-	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
-};
-
-#define path_root_bh(_path) ((_path)->p_node[0].bh)
-#define path_root_el(_path) ((_path)->p_node[0].el)
-#define path_root_access(_path)((_path)->p_root_access)
-#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
-#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
-#define path_num_items(_path) ((_path)->p_tree_depth + 1)
-
 /*
  * Reset the actual path elements so that we can re-use the structure
  * to build another path. Generally, this involves freeing the buffer
  * heads.
  */
-static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 {
 	int i, start = 0, depth = 0;
 	struct ocfs2_path_item *node;
@@ -630,7 +602,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int
keep_root)
 	path->p_tree_depth = depth;
 }
 
-static void ocfs2_free_path(struct ocfs2_path *path)
+void ocfs2_free_path(struct ocfs2_path *path)
 {
 	if (path) {
 		ocfs2_reinit_path(path, 0);
@@ -728,13 +700,13 @@ static struct ocfs2_path *ocfs2_new_path(struct
buffer_head *root_bh,
 	return path;
 }
 
-static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
 {
 	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
 			      path_root_access(path));
 }
 
-static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
 {
 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
 			      et->et_root_journal_access);
@@ -747,10 +719,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct
ocfs2_extent_tree *et)
  * I don't like the way this function's name looks next to
  * ocfs2_journal_access_path(), but I don't have a better one.
  */
-static int ocfs2_path_bh_journal_access(handle_t *handle,
-					struct ocfs2_caching_info *ci,
-					struct ocfs2_path *path,
-					int idx)
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx)
 {
 	ocfs2_journal_access_func access = path_root_access(path);
 
@@ -767,9 +739,9 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
 /*
  * Convenience function to journal all components in a path.
  */
-static int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
-				     handle_t *handle,
-				     struct ocfs2_path *path)
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path)
 {
 	int i, ret = 0;
 
@@ -1872,8 +1844,8 @@ static void find_path_ins(void *data, struct buffer_head
*bh)
 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
 	fp->index++;
 }
-static int ocfs2_find_path(struct ocfs2_caching_info *ci,
-			   struct ocfs2_path *path, u32 cpos)
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path, u32 cpos)
 {
 	struct find_path_data data;
 
@@ -4998,13 +4970,13 @@ out:
  * have been brought into cache (and pinned via the journal), so the
  * extra overhead is not expressed in terms of disk reads.
  */
-static int __ocfs2_split_extent(handle_t *handle,
-				struct ocfs2_extent_tree *et,
-				struct ocfs2_path *path,
-				int split_index,
-				struct ocfs2_extent_rec *split_rec,
-				struct ocfs2_alloc_context *meta_ac,
-				struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5139,9 +5111,9 @@ static int ocfs2_change_extent_flag(handle_t *handle,
 	else
 		split_rec.e_flags &= ~clear_flags;
 
-	ret = __ocfs2_split_extent(handle, et, left_path,
-				  index, &split_rec, meta_ac,
-				  dealloc);
+	ret = ocfs2_split_extent(handle, et, left_path,
+				 index, &split_rec, meta_ac,
+				 dealloc);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index df0e778..3f43489 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -115,6 +115,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret);
 struct ocfs2_cached_dealloc_ctxt;
+struct ocfs2_path;
+int ocfs2_split_extent(handle_t *handle,
+		       struct ocfs2_extent_tree *et,
+		       struct ocfs2_path *path,
+		       int split_index,
+		       struct ocfs2_extent_rec *split_rec,
+		       struct ocfs2_alloc_context *meta_ac,
+		       struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_mark_extent_written(struct inode *inode,
 			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
@@ -254,4 +262,45 @@ static inline int ocfs2_is_empty_extent(struct
ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	struct buffer_head		*bh;
+	struct ocfs2_extent_list	*el;
+};
+
+#define OCFS2_MAX_PATH_DEPTH	5
+
+struct ocfs2_path {
+	int				p_tree_depth;
+	ocfs2_journal_access_func	p_root_access;
+	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
+};
+
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
+
+void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
+void ocfs2_free_path(struct ocfs2_path *path);
+int ocfs2_find_path(struct ocfs2_caching_info *ci,
+		    struct ocfs2_path *path,
+		    u32 cpos);
+struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
+struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
+int ocfs2_path_bh_journal_access(handle_t *handle,
+				 struct ocfs2_caching_info *ci,
+				 struct ocfs2_path *path,
+				 int idx);
+int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+			      handle_t *handle,
+			      struct ocfs2_path *path);
 #endif /* OCFS2_ALLOC_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 10/39] ocfs2: Add support for incrementing refcount in the tree.

Given a physical cpos and length, increment the refcount
in the tree. If the extent has not been seen before, a refcount
record is created for it. Refcount records may be merged or
split by this operation.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/extent_map.c   |   15 +-
 fs/ocfs2/extent_map.h   |    5 +
 fs/ocfs2/ocfs2_fs.h     |    7 +
 fs/ocfs2/refcounttree.c |  918 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 938 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index dc9482c..40b5105 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct
ocfs2_extent_list *el,
  * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
  * containing el.
  */
-static int ocfs2_figure_hole_clusters(struct inode *inode,
-				      struct ocfs2_extent_list *el,
-				      struct buffer_head *eb_bh,
-				      u32 v_cluster,
-				      u32 *num_clusters)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters)
 {
 	int ret, i;
 	struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+		ret = ocfs2_read_extent_block(ci,
 					      le64_to_cpu(eb->h_next_leaf_blk),
 					      &next_eb_bh);
 		if (ret) {
@@ -456,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 		 * field.
 		 */
 		if (hole_len) {
-			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+			ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+							 el, eb_bh,
 							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd973..9942f47 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -61,6 +61,11 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block,
int nr,
 			   struct buffer_head *bhs[], int flags,
 			   int (*validate)(struct super_block *sb,
 					   struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters);
 static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
 					struct buffer_head **bh,
 					int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 8d47d74..66d0bd7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -916,6 +916,7 @@ struct ocfs2_refcount_rec {
 	__le32 r_refcount;	/* Reference count of this extent */
 /*10*/
 };
+#define OCFS2_32BIT_POS_MASK		(0xffffffffULL)
 
 #define OCFS2_REFCOUNT_LEAF_FL          (0x00000001)
 #define OCFS2_REFCOUNT_TREE_FL          (0x00000002)
@@ -1391,6 +1392,12 @@ static inline u16 ocfs2_refcount_recs_per_rb(struct
super_block *sb)
 
 	return size / sizeof(struct ocfs2_refcount_rec);
 }
+
+static inline u32
+ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
+{
+	return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d158936..bc4bfb8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -15,6 +15,7 @@
  * General Public License for more details.
  */
 
+#include <linux/sort.h>
 #define MLOG_MASK_PREFIX ML_REFCOUNT
 #include <cluster/masklog.h>
 #include "ocfs2.h"
@@ -29,6 +30,7 @@
 #include "refcounttree.h"
 #include "sysfile.h"
 #include "dlmglue.h"
+#include "extent_map.h"
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -370,3 +372,919 @@ out:
 
 	return ret;
 }
+
+static void ocfs2_get_refcount_rec_from_list(struct ocfs2_caching_info *ci,
+					struct buffer_head *ref_leaf_bh,
+					u64 cpos, unsigned int len,
+					struct ocfs2_refcount_rec *ret_rec,
+					int *index)
+{
+	int i = 0;
+	struct ocfs2_refcount_block *rb +		(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = NULL;
+
+	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
+		rec = &rb->rf_records.rl_recs[i];
+
+		if (le64_to_cpu(rec->r_cpos) +
+		    le32_to_cpu(rec->r_clusters) <= cpos)
+			continue;
+		else if (le64_to_cpu(rec->r_cpos) > cpos)
+			break;
+
+		/* ok, cpos fail in this rec. Just return. */
+		if (ret_rec)
+			*ret_rec = *rec;
+		goto out;
+	}
+
+	if (ret_rec) {
+		/* We meet with a hole here, so fake the rec. */
+		ret_rec->r_cpos = cpu_to_le64(cpos);
+		ret_rec->r_refcount = 0;
+		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
+		    le64_to_cpu(rec->r_cpos) < cpos + len)
+			ret_rec->r_clusters +				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
+		else
+			ret_rec->r_clusters = cpu_to_le32(len);
+	}
+
+out:
+	*index = i;
+	return;
+}
+/*
+ * Given a cpos and len, try to find the refcount record which contains cpos.
+ * 1. If cpos can be found in one refcount record, return the record.
+ * 2. If cpos can't be found, return a fake record which start from cpos
+ *    and end at a small value between cpos+len and start of the next record.
+ *    This fake record has r_count = 0.
+ */
+static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
+				  struct buffer_head *ref_root_bh,
+				  u64 cpos, unsigned int len,
+				  struct ocfs2_refcount_rec *ret_rec,
+				  int *index,
+				  struct buffer_head **ret_bh)
+{
+	int ret = 0, i, found;
+	u32 low_cpos;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *tmp, *rec = NULL;
+	struct ocfs2_extent_block *eb;
+	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_root_bh->b_data;
+
+	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
+		ocfs2_get_refcount_rec_from_list(ci, ref_root_bh, cpos, len,
+						 ret_rec, index);
+		*ret_bh = ref_root_bh;
+		get_bh(ref_root_bh);
+		return 0;
+	}
+
+	el = &rb->rf_list;
+	low_cpos = cpos & OCFS2_32BIT_POS_MASK;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(sb,
+			"refcount tree %llu has non zero tree "
+			"depth in leaf btree tree block %llu\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			(unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	found = 0;
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		ocfs2_error(sb, "refcount tree %llu has bad extent "
+			"record (%u, %u, 0) in btree",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	/* adjust len when we have ocfs2_extent_rec after it. */
+	if (i < le16_to_cpu(el->l_next_free_rec) - 1) {
+		tmp = &el->l_recs[i+1];
+
+		if (le32_to_cpu(tmp->e_cpos) < cpos + len)
+			len = le32_to_cpu(tmp->e_cpos) - cpos;
+	}
+
+	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
+					&ref_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_get_refcount_rec_from_list(ci, ref_leaf_bh, cpos, len,
+					 ret_rec, index);
+	*ret_bh = ref_leaf_bh;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+enum ocfs2_ref_rec_contig {
+	REF_CONTIG_NONE = 0,
+	REF_CONTIG_LEFT,
+	REF_CONTIG_RIGHT,
+	REF_CONTIG_LEFTRIGHT,
+};
+
+static inline enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
+				    int index)
+{
+	if ((rb->rf_records.rl_recs[index].r_refcount =+	   
rb->rf_records.rl_recs[index + 1].r_refcount) &&
+	    (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
+	    le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) =+	   
le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
+		return REF_CONTIG_RIGHT;
+
+	return REF_CONTIG_NONE;
+}
+
+static enum ocfs2_ref_rec_contig
+	ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
+				  int index)
+{
+	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
+		ret = ocfs2_refcount_rec_adjacent(rb, index);
+
+	if (index > 0) {
+		enum ocfs2_ref_rec_contig tmp;
+
+		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
+
+		if (tmp == REF_CONTIG_RIGHT) {
+			if (ret == REF_CONTIG_RIGHT)
+				ret = REF_CONTIG_LEFTRIGHT;
+			else
+				ret = REF_CONTIG_LEFT;
+		}
+	}
+
+	return ret;
+}
+
+static inline void
+	ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
+				       int index)
+{
+	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !+	      
rb->rf_records.rl_recs[index+1].r_refcount);
+
+	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
+		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
+
+	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
+		memmove(&rb->rf_records.rl_recs[index + 1],
+			&rb->rf_records.rl_recs[index + 2],
+			sizeof(struct ocfs2_refcount_rec) *
+			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));
+
+	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) -
1],
+	       0, sizeof(struct ocfs2_refcount_rec));
+	le16_add_cpu(&rb->rf_records.rl_used, -1);
+}
+
+/*
+ * Merge the refcount rec if we are contiguous with the adjacent recs.
+ */
+static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
+				     int index)
+{
+	enum ocfs2_ref_rec_contig contig +				ocfs2_refcount_rec_contig(rb, index);
+
+	if (contig == REF_CONTIG_NONE)
+		return;
+
+	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
+		BUG_ON(index == 0);
+		index--;
+	}
+
+	ocfs2_rotate_refcount_rec_left(rb, index);
+
+	if (contig == REF_CONTIG_LEFTRIGHT)
+		ocfs2_rotate_refcount_rec_left(rb, index);
+}
+
+static int ocfs2_change_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_leaf_bh,
+				     int index, int change)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "change index %d, old count %u, change %d\n", index,
+	     le32_to_cpu(rec->r_refcount), change);
+	le32_add_cpu(&rec->r_refcount, change);
+
+	ocfs2_refcount_rec_merge(rb, index);
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static int ocfs2_expand_inline_ref_root(handle_t *handle,
+					struct ocfs2_caching_info *ci,
+					struct buffer_head *ref_root_bh,
+					struct buffer_head **ref_leaf_bh,
+					struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+	struct ocfs2_refcount_block *root_rb +			(struct ocfs2_refcount_block
*)ref_root_bh->b_data;
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Initialize ocfs2_refcount_block.
+	 * It should contain the same information as the old root.
+	 * so just memcpy it and change the corresponding field.
+	 */
+	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
+
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_cpos = cpu_to_le32(0);
+	ocfs2_journal_dirty(handle, new_bh);
+
+	/* Now change the root. */
+	memset(&root_rb->rf_list, 0, sb->s_blocksize -
+	       offsetof(struct ocfs2_refcount_block, rf_list));
+	root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
+	root_rb->rf_clusters = cpu_to_le32(1);
+	root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
+	root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+	root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+	mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
+	     le16_to_cpu(new_rb->rf_records.rl_used));
+
+	*ref_leaf_bh = new_bh;
+	new_bh = NULL;
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
+					   struct ocfs2_refcount_rec *next)
+{
+	if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <+	
ocfs2_get_ref_rec_low_cpos(next))
+		return 1;
+
+	return 0;
+}
+
+static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
+	u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
+{
+	const struct ocfs2_refcount_rec *l = a, *r = b;
+	u64 l_cpos = le64_to_cpu(l->r_cpos);
+	u64 r_cpos = le64_to_cpu(r->r_cpos);
+
+	if (l_cpos > r_cpos)
+		return 1;
+	if (l_cpos < r_cpos)
+		return -1;
+	return 0;
+}
+
+static void swap_refcount_rec(void *a, void *b, int size)
+{
+	struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+
+	tmp = *l;
+	memcpy(l, r, sizeof(struct ocfs2_refcount_rec));
+	memcpy(r, &tmp, sizeof(struct ocfs2_refcount_rec));
+}
+
+/*
+ * The refcount cpos are ordered by their 64bit cpos,
+ * But we will use the low 32 bit to be the e_cpos in the b-tree.
+ * So we need to make sure that this pos isn't intersected with others.
+ *
+ * Note: The refcount block is already sorted by their low 32 bit cpos,
+ *       So just try the middle pos first, and we will exit when we find
+ *       the good position.
+ */
+static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
+					 u32 *split_pos, int *split_index)
+{
+	int num_used = le16_to_cpu(rl->rl_used);
+	int delta, middle = num_used / 2;
+
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle - delta - 1],
+					&rl->rl_recs[middle - delta])) {
+			*split_index = middle - delta;
+			goto out;
+		}
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == num_used)
+			continue;
+
+		/* Now try delta past middle */
+		if (ocfs2_refcount_rec_no_intersect(
+					&rl->rl_recs[middle + delta],
+					&rl->rl_recs[middle + delta + 1])) {
+			*split_index = middle + delta + 1;
+			goto out;
+		}
+	}
+
+	/* We can't find a suitable split pos in the refcount block. */
+	return -ENOSPC;
+out:
+
+	*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
+	return 0;
+}
+
+static int ocfs2_insert_leaf_refcount_block(handle_t *handle,
+					    struct ocfs2_caching_info *ci,
+					    struct buffer_head *ref_root_bh,
+					    struct buffer_head *ref_leaf_bh,
+					    struct buffer_head *new_bh,
+					    struct ocfs2_alloc_context *meta_ac)
+{
+	int ret, split_index = 0, num_moved;
+	u32 cpos = 0;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_block *new_rb +			(struct ocfs2_refcount_block
*)new_bh->b_data;
+	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
+	struct ocfs2_extent_tree ref_et;
+
+	/*
+	 * XXX: Improvement later.
+	 * If we know all the high 32 bit cpos is the same, no need to sort.
+	 *
+	 * In order to make the whole process safe, we do:
+	 * 1. sort the entries by their low 32 bit cpos first so that we can
+	 *    find the split cpos easily.
+	 * 2. call ocfs2_insert_extent to insert the new refcount block.
+	 * 3. move the refcount rec to the new block.
+	 * 4. sort the entries by their 64 bit cpos.
+	 * 5. dirty the new_rb and rb.
+	 */
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
+
+	ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
+
+	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
+
+	mlog(0, "insert new leaf block %llu at %u, split_index %d\n",
+	     (unsigned long long)new_bh->b_blocknr, cpos, split_index);
+
+	ret = ocfs2_insert_extent(handle, &ref_et, cpos, new_bh->b_blocknr,
+				  1, 0, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	new_rb->rf_cpos = cpu_to_le32(cpos);
+
+	num_moved = le16_to_cpu(rl->rl_used) - split_index;
+	memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	memset(&rl->rl_recs[split_index], 0,
+	       num_moved * sizeof(struct ocfs2_refcount_rec));
+
+	le16_add_cpu(&rl->rl_used, -num_moved);
+	new_rl->rl_used = cpu_to_le32(num_moved);
+
+	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
+	     sizeof(struct ocfs2_refcount_rec),
+	     cmp_refcount_rec_by_cpos, swap_refcount_rec);
+
+out:
+	return ret;
+}
+
+static int ocfs2_new_leaf_refcount_block(handle_t *handle,
+					 struct ocfs2_caching_info *ci,
+					 struct buffer_head *ref_root_bh,
+					 struct buffer_head *ref_leaf_bh,
+					 struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *root_rb +			(struct ocfs2_refcount_block
*)ref_root_bh->b_data;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_refcount_block *new_rb;
+
+	BUG_ON(!ref_root_bh);
+	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_bh = sb_getblk(sb, blkno);
+	if (new_bh == NULL) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Initialize ocfs2_refcount_block. */
+	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
+	memset(new_rb, 0, sb->s_blocksize);
+	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
+	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+	new_rb->rf_blkno = cpu_to_le64(blkno);
+	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+	new_rb->rf_records.rl_count +			
cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+
+	ret = ocfs2_insert_leaf_refcount_block(handle, ci, ref_root_bh,
+					       ref_leaf_bh, new_bh, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_journal_dirty(handle, ref_leaf_bh);
+	ocfs2_journal_dirty(handle, new_bh);
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int ocfs2_expand_refcount_tree(handle_t *handle,
+				      struct ocfs2_caching_info *ci,
+				      struct buffer_head *ref_root_bh,
+				      struct buffer_head *ref_leaf_bh,
+				      struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct buffer_head *expand_bh = NULL;
+
+	if (ref_root_bh == ref_leaf_bh) {
+		/*
+		 * the old root bh hasn't been expanded to a b-tree,
+		 * so expand it first.
+		 */
+		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
+						   &expand_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else {
+		expand_bh = ref_leaf_bh;
+		get_bh(expand_bh);
+	}
+
+
+	/* Now add a new refcount block into the tree.*/
+	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
+					    expand_bh, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(expand_bh);
+	return ret;
+}
+
+static int ocfs2_insert_refcount_rec(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     struct buffer_head *ref_leaf_bh,
+				     struct ocfs2_refcount_rec *rec,
+				     int index,
+				     struct ocfs2_alloc_context *meta_ac)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_LEAF_FL));
+
+	if (rf_list->rl_used == rf_list->rl_count) {
+		u64 cpos = le64_to_cpu(rec->r_cpos);
+		u32 len = le32_to_cpu(rec->r_clusters);
+
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, NULL, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (index < le16_to_cpu(rf_list->rl_used))
+		memmove(&rf_list->rl_recs[index + 1],
+			&rf_list->rl_recs[index],
+			(le16_to_cpu(rf_list->rl_used) - index) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	rf_list->rl_recs[index] = *rec;
+
+	le16_add_cpu(&rf_list->rl_used, 1);
+
+	ocfs2_refcount_rec_merge(rb, index);
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+/*
+ * Split the refcount_rec indexed by "index" in ref_leaf_bh.
+ * This is much simple than our b-tree code.
+ * split_rec is the new refcount rec we want to insert.
+ * If split_rec->r_refcount > 0, we are changing the refcount(in case we
+ * increase refcount or decrease a refcount to non-zero).
+ * If split_rec->r_refcount == 0, we are punching a hole in current refcount
+ * rec( in case we decrease a refcount to zero).
+ */
+static int ocfs2_split_refcount_rec(handle_t *handle,
+				    struct ocfs2_caching_info *ci,
+				    struct buffer_head *ref_root_bh,
+				    struct buffer_head *ref_leaf_bh,
+				    struct ocfs2_refcount_rec *split_rec,
+				    int index,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, recs_need;
+	u32 len;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
+	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
+	struct ocfs2_refcount_rec *tail_rec = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	BUG_ON(!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_LEAF_FL));
+
+	mlog(0, "original r_pos %llu, cluster %u, split %llu, c %u\n",
+	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
+	     le64_to_cpu(split_rec->r_cpos),
+	     le32_to_cpu(split_rec->r_clusters));
+
+	/*
+	 * If we just need to split the header or tail clusters,
+	 * no more recs are needed, just split is OK.
+	 * Otherwise we at least need one new recs.
+	 */
+	if (!split_rec->r_refcount &&
+	    (split_rec->r_cpos == orig_rec->r_cpos ||
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) =+	    
le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need = 0;
+	else
+		recs_need = 1;
+
+	/*
+	 * We need one more rec if we split in the middle and the new rec have
+	 * some refcount in it.
+	 */
+	if (split_rec->r_refcount &&
+	    (split_rec->r_cpos != orig_rec->r_cpos &&
+	     le64_to_cpu(split_rec->r_cpos) +
+	     le32_to_cpu(split_rec->r_clusters) !+	    
le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
+		recs_need++;
+
+	if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) {
+		struct ocfs2_refcount_rec tmp_rec;
+		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
+		len = le32_to_cpu(orig_rec->r_clusters);
+		mlog(0, "expand refcount tree.\n");
+		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
+						 ref_leaf_bh, meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &tmp_rec, &index,
+					     &new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ref_leaf_bh = new_bh;
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+		rf_list = &rb->rf_records;
+		orig_rec = &rf_list->rl_recs[index];
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (index != le16_to_cpu(rf_list->rl_used) - 1)
+		memmove(&rf_list->rl_recs[index + 1 + recs_need],
+			&rf_list->rl_recs[index + 1],
+			(le16_to_cpu(rf_list->rl_used) - index - 1) *
+			 sizeof(struct ocfs2_refcount_rec));
+
+	len = (le64_to_cpu(orig_rec->r_cpos) +
+	      le32_to_cpu(orig_rec->r_clusters)) -
+	      (le64_to_cpu(split_rec->r_cpos) +
+	      le32_to_cpu(split_rec->r_clusters));
+
+	if (len) {
+		tail_rec = &rf_list->rl_recs[index + recs_need];
+
+		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
+		le64_add_cpu(&tail_rec->r_cpos,
+			     le32_to_cpu(tail_rec->r_clusters) - len);
+		tail_rec->r_clusters = le32_to_cpu(len);
+	}
+
+	if (tail_rec != orig_rec && split_rec->r_cpos !=
orig_rec->r_cpos) {
+		len = le64_to_cpu(split_rec->r_cpos) -
+		      le64_to_cpu(orig_rec->r_cpos);
+		orig_rec->r_clusters = cpu_to_le32(len);
+		index++;
+	}
+
+	le16_add_cpu(&rf_list->rl_used, recs_need);
+
+	if (split_rec->r_refcount) {
+		rf_list->rl_recs[index] = *split_rec;
+		ocfs2_refcount_rec_merge(rb, index);
+	}
+
+	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_bh);
+	return ret;
+}
+
+static int __ocfs2_increase_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0, index;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_refcount_rec rec;
+	unsigned int set_len = 0;
+
+	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     (unsigned long long)cpos, len);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		set_len = le32_to_cpu(rec.r_clusters);
+
+		/*
+		 * Here we may meet with 3 situations:
+		 *
+		 * 1. If we find an already existing record, and the length
+		 *    is the same, cool, we just need to increase the r_count
+		 *    and it is OK.
+		 * 2. If we find a hole, just insert it with r_count = 1.
+		 * 3. If we are in the middle of one extent record, split
+		 *    it.
+		 */
+		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
+		    set_len <= len) {
+			mlog(0, "increase refcount rec, start %llu, len %u, "
+			     "count %u\n", (unsigned long long)cpos, set_len,
+			     le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_change_refcount_rec(handle, ci,
+							ref_leaf_bh, index, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else if (!rec.r_refcount) {
+			rec.r_refcount = cpu_to_le32(1);
+
+			mlog(0, "insert refcount rec, start %llu, len %u\n",
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len);
+			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
+							ref_leaf_bh,
+							&rec, index, meta_ac);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else  {
+			set_len = min((u64)(cpos + len),
+				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
+			rec.r_cpos = cpu_to_le64(cpos);
+			rec.r_clusters = cpu_to_le32(set_len);
+			le32_add_cpu(&rec.r_refcount, 1);
+
+			mlog(0, "split refcount rec, start %llu, "
+			     "len %u, count %u\n",
+			     (unsigned long long)le64_to_cpu(rec.r_cpos),
+			     set_len, le32_to_cpu(rec.r_refcount));
+			ret = ocfs2_split_refcount_rec(handle, ci,
+						       ref_root_bh, ref_leaf_bh,
+						       &rec, index,
+						       meta_ac, dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += set_len;
+		len -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 11/39] ocfs2: Add support of decrementing refcount for delete.

Given a physical cpos and length, decrement the refcount
in the tree. If the refcount for any portion of the extent goes
to zero, that portion is queued for freeing.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |  227 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/refcounttree.h |    4 +
 2 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bc4bfb8..58b2c12 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -603,6 +603,10 @@ static void ocfs2_refcount_rec_merge(struct
ocfs2_refcount_block *rb,
 		ocfs2_rotate_refcount_rec_left(rb, index);
 }
 
+/*
+ * Change the refcount indexed by "index" in ref_bh.
+ * If refcount reaches 0, remove it.
+ */
 static int ocfs2_change_refcount_rec(handle_t *handle,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *ref_leaf_bh,
@@ -611,7 +615,8 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 	int ret;
 	struct ocfs2_refcount_block *rb  			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
-	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+	struct ocfs2_refcount_list *rl = &rb->rf_records;
+	struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
 
 	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
@@ -624,7 +629,18 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 	     le32_to_cpu(rec->r_refcount), change);
 	le32_add_cpu(&rec->r_refcount, change);
 
-	ocfs2_refcount_rec_merge(rb, index);
+	if (!rec->r_refcount) {
+		if (index != le16_to_cpu(rl->rl_used) - 1) {
+			memmove(rec, rec + 1,
+				(le16_to_cpu(rl->rl_used) - index - 1) *
+				sizeof(struct ocfs2_refcount_rec));
+			memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
+			       0, sizeof(struct ocfs2_refcount_rec));
+		}
+
+		le16_add_cpu(&rl->rl_used, -1);
+	} else
+		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
 	if (ret)
@@ -1288,3 +1304,210 @@ out:
 	brelse(ref_leaf_bh);
 	return ret;
 }
+
+static int ocfs2_remove_refcount_extent(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_extent_tree et;
+
+	BUG_ON(rb->rf_records.rl_used);
+
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
+				  1, meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	le32_add_cpu(&rb->rf_clusters, -1);
+
+	/*
+	 * check whether we need to restore the root refcount block if
+	 * there is no leaf extent block at atll.
+	 */
+	if (!rb->rf_list.l_next_free_rec) {
+		BUG_ON(rb->rf_clusters);
+
+		mlog(0, "reset refcount block root %llu to a leaf block.\n",
+		     (unsigned long long)ref_root_bh->b_blocknr);
+
+		memset(&rb->rf_records, 0, sb->s_blocksize -
+		       offsetof(struct ocfs2_refcount_block, rf_records));
+		rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
+		rb->rf_records.rl_count +				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
+	}
+
+	ocfs2_journal_dirty(handle, ref_root_bh);
+
+out:
+	return ret;
+}
+
+static int ocfs2_decrease_refcount_rec(handle_t *handle,
+				struct ocfs2_caching_info *ci,
+				struct buffer_head *ref_root_bh,
+				struct buffer_head *ref_leaf_bh,
+				int index, u64 cpos, unsigned int len,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
+	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
+
+	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
+	BUG_ON(cpos + len >
+	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
+
+	if (cpos == le64_to_cpu(rec->r_cpos) && cpos + len =+	   
le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters))
+		ret = ocfs2_change_refcount_rec(handle, ci,
+						ref_leaf_bh, index, -1);
+	else {
+		struct ocfs2_refcount_rec split = *rec;
+		split.r_cpos = cpu_to_le64(cpos);
+		split.r_clusters = cpu_to_le32(len);
+
+		le32_add_cpu(&split.r_refcount, -1);
+
+		mlog(0, "split refcount rec, start %llu, "
+		     "len %u, count %u, original start %llu, len %u\n",
+		     (unsigned long long)le64_to_cpu(split.r_cpos),
+		     len, le32_to_cpu(split.r_refcount),
+		     (unsigned long long)le64_to_cpu(rec->r_cpos),
+		     le32_to_cpu(rec->r_clusters));
+		ret = ocfs2_split_refcount_rec(handle, ci,
+					       ref_root_bh, ref_leaf_bh,
+					       &split, index,
+					       meta_ac, dealloc);
+	}
+
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Remove the leaf refcount block if it contains no refcount record. */
+	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
+		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
+						   ref_leaf_bh, meta_ac,
+						   dealloc);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	return ret;
+}
+
+static int __ocfs2_decrease_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0, index = 0;
+	struct ocfs2_refcount_rec rec;
+	unsigned int r_count = 0, r_len;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     (unsigned long long)cpos, len);
+
+	while (len) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, len, &rec, &index,
+					     &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		r_count = le32_to_cpu(rec.r_refcount);
+		BUG_ON(r_count == 0);
+
+		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - cpos;
+
+		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
+						  ref_leaf_bh, index,
+						  cpos, r_len,
+						  meta_ac, dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			ret = ocfs2_cache_cluster_dealloc(dealloc,
+					  ocfs2_clusters_to_blocks(sb, cpos),
+							  r_len);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += r_len;
+		len -= r_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di->i_refcount_loc);
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, INODE_CACHE(inode), ref_root_bh,
+					cpos, len, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9f4bdac..92fe116 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -23,4 +23,8 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 			    u64 blkno);
 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
 
+int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
+			    handle_t *handle, u32 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 12/39] ocfs2: Add functions for extents refcounted.

Add function ocfs2_mark_extent_refcounted which can mark
an extent refcounted.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |   12 ++++++------
 fs/ocfs2/alloc.h        |    6 ++++++
 fs/ocfs2/ocfs2.h        |    7 +++++++
 fs/ocfs2/refcounttree.c |   39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f8c41d6..bbea71c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5053,12 +5053,12 @@ out:
 	return ret;
 }
 
-static int ocfs2_change_extent_flag(handle_t *handle,
-				    struct ocfs2_extent_tree *et,
-				    u32 cpos, u32 len, u32 phys,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    int new_flags, int clear_flags)
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags)
 {
 	int ret, index;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3f43489..10a7b58 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -128,6 +128,12 @@ int ocfs2_mark_extent_written(struct inode *inode,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_change_extent_flag(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 len, u32 phys,
+			     struct ocfs2_alloc_context *meta_ac,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc,
+			     int new_flags, int clear_flags);
 int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len,
 			struct ocfs2_alloc_context *meta_ac,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f02b939..62bee1e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -484,6 +484,13 @@ static inline void ocfs2_add_links_count(struct
ocfs2_dinode *di, int n)
 	ocfs2_set_links_count(di, links);
 }
 
+static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 58b2c12..8dc5f3c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1511,3 +1511,42 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+/*
+ * Mark the already-existing extent at cpos as refcounted for len clusters.
+ * This adds the refcount extent flag.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+static int ocfs2_mark_extent_refcounted(struct inode *inode,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle, u32 cpos,
+				u32 len, u32 phys,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+
+	mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster
%u\n",
+	     inode->i_ino, cpos, len, phys);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_change_extent_flag(handle, et, cpos,
+				       len, phys, meta_ac, dealloc,
+				       OCFS2_EXT_REFCOUNTED, 0);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 13/39] ocfs2: Hook 'Decrement refcount for delete'.

Add 'Decrement refcount for delete' in to the normal truncate
process. So for a refcounted extent record, call refcount rec
decrementation instead of cluster free.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |   55 ++++++++++++--
 fs/ocfs2/journal.h      |    4 +
 fs/ocfs2/refcounttree.c |  195 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    6 ++
 4 files changed, 253 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index bbea71c..e8ff5f7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6547,7 +6547,7 @@ out:
  */
 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
 			   handle_t *handle, struct ocfs2_truncate_context *tc,
-			   u32 clusters_to_del, u64 *delete_start)
+			   u32 clusters_to_del, u64 *delete_start, u8 *flags)
 {
 	int ret, i, index = path->p_tree_depth;
 	u32 new_edge = 0;
@@ -6557,6 +6557,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct
ocfs2_path *path,
 	struct ocfs2_extent_rec *rec;
 
 	*delete_start = 0;
+	*flags = 0;
 
 	while (index >= 0) {
 		bh = path->p_node[index].bh;
@@ -6644,6 +6645,7 @@ find_tail_record:
 			*delete_start = le64_to_cpu(rec->e_blkno)
 				+ ocfs2_clusters_to_blocks(inode->i_sb,
 					le16_to_cpu(rec->e_leaf_clusters));
+			*flags = rec->e_flags;
 
 			/*
 			 * If it's now empty, remove this record.
@@ -6743,7 +6745,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 			     struct buffer_head *fe_bh,
 			     handle_t *handle,
 			     struct ocfs2_truncate_context *tc,
-			     struct ocfs2_path *path)
+			     struct ocfs2_path *path,
+			     struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
 	struct ocfs2_dinode *fe;
@@ -6751,6 +6754,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *el;
 	struct buffer_head *last_eb_bh = NULL;
 	u64 delete_blk = 0;
+	u8 rec_flags;
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
@@ -6806,7 +6810,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
 
 	status = ocfs2_trim_tree(inode, path, handle, tc,
-				 clusters_to_del, &delete_blk);
+				 clusters_to_del, &delete_blk, &rec_flags);
 	if (status) {
 		mlog_errno(status);
 		goto bail;
@@ -6838,8 +6842,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 
 	if (delete_blk) {
-		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-						   clusters_to_del);
+		if (rec_flags & OCFS2_EXT_REFCOUNTED)
+			status = ocfs2_decrease_refcount(inode, fe_bh, handle,
+					ocfs2_blocks_to_clusters(osb->sb,
+								 delete_blk),
+					clusters_to_del, meta_ac,
+					&tc->tc_dealloc);
+		else
+			status = ocfs2_truncate_log_append(osb, handle,
+							   delete_blk,
+							   clusters_to_del);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -7257,11 +7269,13 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 {
 	int status, i, credits, tl_sem = 0;
 	u32 clusters_to_del, new_highest_cpos, range;
+	u64 blkno = 0;
 	struct ocfs2_extent_list *el;
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
 
 	mlog_entry_void();
 
@@ -7287,6 +7301,8 @@ start:
 		goto bail;
 	}
 
+	credits = 0;
+
 	/*
 	 * Truncate always works against the rightmost tree branch.
 	 */
@@ -7327,10 +7343,15 @@ start:
 		clusters_to_del = 0;
 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
 	} else if (range > new_highest_cpos) {
 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
 				  new_highest_cpos;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
+			ocfs2_clusters_to_blocks(inode->i_sb,
+				ocfs2_rec_clusters(el, &el->l_recs[i]) -
+				clusters_to_del);
 	} else {
 		status = 0;
 		goto bail;
@@ -7339,6 +7360,18 @@ start:
 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
 
+	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED &&
clusters_to_del) {
+		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
+							       blkno,
+							       clusters_to_del,
+							       &credits,
+							       &meta_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	mutex_lock(&tl_inode->i_mutex);
 	tl_sem = 1;
 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7352,7 +7385,7 @@ start:
 		}
 	}
 
-	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
+	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
 						(struct ocfs2_dinode *)fe_bh->b_data,
 						el);
 	handle = ocfs2_start_trans(osb, credits);
@@ -7364,7 +7397,7 @@ start:
 	}
 
 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
-				   tc, path);
+				   tc, path, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -7378,6 +7411,11 @@ start:
 
 	ocfs2_reinit_path(path, 1);
 
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+
 	/*
 	 * The check above will catch the case where we've truncated
 	 * away all allocation.
@@ -7398,6 +7436,9 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
 
 	ocfs2_free_path(path);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bfd5942..dbd7888 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -485,6 +485,10 @@ static inline int ocfs2_calc_dxi_expand_credits(struct
super_block *sb)
 #define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
 					    + OCFS2_SUBALLOC_ALLOC + 1)
 #define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list.
Otherwise
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8dc5f3c..b56d083 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1550,3 +1550,198 @@ static int ocfs2_mark_extent_refcounted(struct inode
*inode,
 out:
 	return ret;
 }
+
+/*
+ * Given some contiguous physical clusters, calculate what we need
+ * for modifying their refcount.
+ */
+static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
+					    struct ocfs2_caching_info *ci,
+					    struct buffer_head *ref_root_bh,
+					    u64 start_cpos,
+					    u32 clusters,
+					    int *meta_add,
+					    int *credits)
+{
+	int ret = 0, index, ref_blocks = 0, recs_add = 0;
+	u64 cpos = start_cpos;
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
+	u32 len;
+
+	mlog(0, "start_cpos %llu, clusters %u\n",
+	     (unsigned long long)start_cpos, clusters);
+	while (clusters) {
+		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
+					     cpos, clusters, &rec,
+					     &index, &ref_leaf_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+		mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
+		     "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
+		     recs_add, (unsigned long long)cpos, clusters,
+		     (unsigned long long)le64_to_cpu(rec.r_cpos),
+		     le32_to_cpu(rec.r_clusters),
+		     le32_to_cpu(rec.r_refcount), index);
+
+		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
+			  le32_to_cpu(rec.r_clusters)) - cpos;
+		/*
+		 * If the refcount rec already exist, cool. We just need
+		 * to check whether there is a split.
+		 * If we will insert one, check whether we have space.
+		 *
+		 * We record all the records which will be inserted to the
+		 * same refcount block, so that we can tell exactly whether
+		 * we need a new refcount block or not.
+		 */
+		if (rec.r_refcount) {
+			/* Check whether we need a split at the beginning. */
+			if (cpos == start_cpos &&
+			    cpos != le64_to_cpu(rec.r_cpos)) {
+				recs_add++;
+
+				if (le64_to_cpu(rb->rf_records.rl_used) +
+				    recs_add >
+				    le16_to_cpu(rb->rf_records.rl_count))
+					ref_blocks++;
+			}
+
+			/* Check whether we need a split in the end. */
+			if (cpos + clusters > le64_to_cpu(rec.r_cpos) +
+			    le32_to_cpu(rec.r_clusters)) {
+				recs_add++;
+
+				if (le64_to_cpu(rb->rf_records.rl_used) +
+				    recs_add >
+				    le16_to_cpu(rb->rf_records.rl_count))
+					ref_blocks++;
+			}
+		} else {
+			recs_add++;
+
+			if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
+			    le16_to_cpu(rb->rf_records.rl_count))
+				ref_blocks++;
+		}
+
+		if (ref_leaf_bh != prev_bh) {
+			mlog(0, "ref block %llu, rl_used %u, rl_count %u\n",
+			     (unsigned long long)ref_leaf_bh->b_blocknr,
+			     le16_to_cpu(rb->rf_records.rl_used),
+			     le16_to_cpu(rb->rf_records.rl_count));
+			recs_add = 0;
+			*credits += 1;
+			brelse(prev_bh);
+			prev_bh = ref_leaf_bh;
+			ref_leaf_bh = NULL;
+		}
+
+		clusters -= len;
+		cpos += len;
+	}
+
+	mlog(0, "we need ref_blocks %d\n", ref_blocks);
+	*meta_add += ref_blocks;
+
+	if (!ref_blocks)
+		goto out;
+
+	/*
+	 * So we may need ref_blocks to insert into the tree.
+	 * That also means we need to change the b-tree and add that number
+	 * of records since we never merge them.
+	 * We need one more block for expansion since the new created leaf
+	 * block is also full and needs split.
+	 */
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_LEAF_FL) {
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+		*meta_add += 1;
+	} else {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+		*credits += ocfs2_calc_extend_credits(sb,
+						      et.et_root_el,
+						      ref_blocks);
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	brelse(prev_bh);
+	return ret;
+}
+
+/*
+ * For refcount tree, we will decrease some contiguous clusters
+ * refcount count, so just go through it to see how many blocks
+ * we gonna touch and whether we need to create new blocks.
+ *
+ * Normally the refcount blocks store these refcount should be
+ * continguous also, so that we can get the number easily.
+ * As for meta_ac, we will at most add split 2 refcount record and
+ * 2 more refcount block, so just check it in a rough way.
+ */
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  struct buffer_head *di_bh,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, ref_blocks = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct buffer_head *ref_root_bh = NULL;
+	u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		ret = -EROFS;
+		goto out;
+	}
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di->i_refcount_loc);
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       INODE_CACHE(inode),
+					       ref_root_bh,
+					       start_cpos, clusters,
+					       &ref_blocks, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, credits = %d\n",
+	     ref_blocks, *credits);
+
+	if (ref_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 92fe116..0fdf726 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -27,4 +27,10 @@ int ocfs2_decrease_refcount(struct inode *inode, struct
buffer_head *di_bh,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
+					  struct buffer_head *di_bh,
+					  u64 phys_blkno,
+					  u32 clusters,
+					  int *credits,
+					  struct ocfs2_alloc_context **meta_ac);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 14/39] ocfs2: Add CoW support.

When replacing the old refcounted extent record, I don't
remove the old extent record first and then insert the new one.
Because during the tree manipulation(e.g, ocfs2_remove_extent),
we often need to call ocfs2_extend_trans which may restart our
transcation. So if we crash right after the removing and before
the inserting, we will lost the data.
So the whole process will be:
1. If we are replacing the whole extent record, just copy the data
   and replace e_blkno.
2. If we are split the extent record, just initialized the data
   and then call ocfs2_split_extent directly, the tree code is
   modified so that it can handle it.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |   25 ++-
 fs/ocfs2/alloc.h        |    5 +
 fs/ocfs2/aops.c         |   49 ++++-
 fs/ocfs2/refcounttree.c |  678 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    2 +
 5 files changed, 747 insertions(+), 12 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e8ff5f7..365b96e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6871,9 +6871,9 @@ static int ocfs2_zero_func(handle_t *handle, struct
buffer_head *bh)
 	return 0;
 }
 
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
-				     unsigned int from, unsigned int to,
-				     struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
 
@@ -6941,20 +6941,16 @@ out:
 		ocfs2_unlock_and_free_pages(pages, numpages);
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
-				struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num)
 {
 	int numpages, ret = 0;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
 	loff_t last_page_bytes;
 
 	BUG_ON(start > end);
 
-	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !-	       (end - 1)
>> OCFS2_SB(sb)->s_clustersize_bits);
-
 	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
@@ -6982,6 +6978,17 @@ out:
 	return ret;
 }
 
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num)
+{
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !+	       (end - 1)
>> OCFS2_SB(sb)->s_clustersize_bits);
+
+	return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
 /*
  * Zero the area past i_size but still within an allocated
  * cluster. This avoids exposing nonzero data on subsequent file
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 10a7b58..fc20fbc 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -268,6 +268,11 @@ static inline int ocfs2_is_empty_extent(struct
ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys);
 /*
  * Structures which describe a path through a btree, and functions to
  * manipulate them.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index db6afb9..dd66a24 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1410,18 +1411,29 @@ static void ocfs2_set_target_boundaries(struct
ocfs2_super *osb,
 	}
 }
 
+static inline void ocfs2_clear_write_desc(struct ocfs2_write_ctxt *wc)
+{
+	memset(&wc->w_desc, 0,
+	       sizeof(struct ocfs2_write_cluster_desc) * wc->w_clen);
+}
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
+ * If we encountered a refcounted cluster, break the process and return
+ * the refcounted start cpos.
  *
  * Returns the number of clusters that will have to be allocated, as
  * well as a worst case estimate of the number of extent records that
  * would have to be created during a write to an unwritten region.
+ *
+ * If we find a refcounted record, return directly with refcounted_cpos
+ * set as the position.
  */
 static int ocfs2_populate_write_desc(struct inode *inode,
 				     struct ocfs2_write_ctxt *wc,
 				     unsigned int *clusters_to_alloc,
-				     unsigned int *extents_to_split)
+				     unsigned int *extents_to_split,
+				     unsigned int *refcounted_cpos)
 {
 	int ret;
 	struct ocfs2_write_cluster_desc *desc;
@@ -1432,6 +1444,7 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 
 	*clusters_to_alloc = 0;
 	*extents_to_split = 0;
+	*refcounted_cpos = UINT_MAX;
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
@@ -1448,6 +1461,11 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 				goto out;
 			}
 
+			if (ext_flags & OCFS2_EXT_REFCOUNTED) {
+				*refcounted_cpos = desc->c_cpos;
+				ret = -ETXTBSY;
+				goto out;
+			}
 			/*
 			 * Assume worst case - that we're writing in
 			 * the middle of the extent.
@@ -1655,6 +1673,7 @@ int ocfs2_write_begin_nolock(struct address_space
*mapping,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
+	unsigned int refcounted_cpos, write_len;
 
 	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
@@ -1682,12 +1701,36 @@ int ocfs2_write_begin_nolock(struct address_space
*mapping,
 	}
 
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
-					&extents_to_split);
-	if (ret) {
+					&extents_to_split, &refcounted_cpos);
+	if (ret && ret != -ETXTBSY) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	if (ret == -ETXTBSY) {
+		BUG_ON(refcounted_cpos == UINT_MAX);
+		write_len = wc->w_clen - (refcounted_cpos - wc->w_cpos);
+
+		ret = ocfs2_refcount_cow(inode, di_bh,
+					 refcounted_cpos, write_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* reinitialize write_desc and populate it again. */
+		ocfs2_clear_write_desc(wc);
+		ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+						&extents_to_split,
+						&refcounted_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(refcounted_cpos != UINT_MAX);
+	}
+
 	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
 	/*
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b56d083..b71753b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -31,6 +31,22 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "extent_map.h"
+#include "aops.h"
+
+struct ocfs2_cow_context {
+	struct inode *inode;
+	struct ocfs2_extent_tree di_et;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct buffer_head **bhs;
+	struct page **cow_pages;
+	int num_pages;
+	u32 cow_start;
+	u32 cow_len;
+};
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -1745,3 +1761,665 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+#define	MAX_COW_BYTES	1048576
+/*
+ * Calculate out the start and number of virtual clusters we need to to CoW.
+ *
+ * cpos is vitual start cluster position we want to do CoW in a
+ * file and write_len is the cluster length.
+ *
+ * Normal we will start CoW from the beginning of extent record cotaining cpos.
+ * And We will try to Cow as much clusters as we can until we reach
+ * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will
+ * use that value as the maximum clusters.
+ */
+static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
+					   struct buffer_head *di_bh,
+					   u32 cpos,
+					   u32 write_len,
+					   u32 *cow_start,
+					   u32 *cow_len,
+					   int *has_data)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	int tree_height = le16_to_cpu(el->l_tree_depth), i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb = NULL;
+	struct ocfs2_extent_rec *rec;
+	int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES);
+	int leaf_clusters, rec_end = 0;
+
+	max_clusters = max_clusters < write_len ? write_len : max_clusters;
+	if (tree_height > 0) {
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	*cow_len = 0;
+	*has_data = 0;
+	for (i = 0; i < le16_to_cpu(el->l_next_free_rec);) {
+		rec = &el->l_recs[i];
+		i++;
+
+		if (ocfs2_is_empty_extent(rec)) {
+			mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
+					"index %d\n", inode->i_ino, i);
+
+			continue;
+		}
+
+		if (le32_to_cpu(rec->e_cpos) +
+		    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
+			continue;
+
+		if (*cow_len == 0) {
+			BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
+			*cow_start = le32_to_cpu(rec->e_cpos);
+			rec_end = le32_to_cpu(rec->e_cpos);
+		}
+
+		if (!*has_data && !(rec->e_flags & OCFS2_EXT_UNWRITTEN))
+			*has_data = 1;
+
+		/*
+		 * If we encounter a hole or a non-refcounted record,
+		 * stop the search.
+		 */
+		if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
+		    rec_end != le32_to_cpu(rec->e_cpos))
+			break;
+
+		leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
+		rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
+
+		if (*cow_len + leaf_clusters >= max_clusters) {
+			if (*cow_len == 0) {
+				/*
+				 * cpos is in a very large extent record.
+				 * So just split max_clusters from the
+				 * extent record.
+				 */
+				leaf_clusters = rec_end - cpos;
+
+				if (leaf_clusters > max_clusters)
+					*cow_start = cpos;
+				else
+					*cow_start = rec_end - max_clusters;
+			}
+			*cow_len = max_clusters;
+			break;
+		} else
+			*cow_len += leaf_clusters;
+
+		/*
+		 * If we reach the end of the extent block and don't get enough
+		 * clusters, continue with the next extent block if possible.
+		 */
+		if (i == le16_to_cpu(el->l_next_free_rec) &&
+		    eb && eb->h_next_leaf_blk) {
+			brelse(eb_bh);
+			eb_bh = NULL;
+
+			ret = ocfs2_read_extent_block(INODE_CACHE(inode),
+					       le64_to_cpu(eb->h_next_leaf_blk),
+					       &eb_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+			el = &eb->h_list;
+			i = 0;
+		}
+	}
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Prepare meta_ac, data_ac and calculate credits when we want to add some
+ * num_clusters in data_tree "et" and change the refcount for the old
+ * clusters(starting form p_cluster) in the refcount tree.
+ *
+ * Note:
+ * 1. since we may split the old tree, so we at most will need num_clusters + 2
+ *    more new leaf records.
+ * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
+ *    just give data_ac = NULL.
+ */
+static int ocfs2_lock_refcount_allocators(struct super_block *sb,
+					u32 p_cluster, u32 num_clusters,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					struct ocfs2_alloc_context **data_ac,
+					int *credits)
+{
+	int ret = 0, meta_add = 0;
+	int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < num_clusters + 2)
+		meta_add +			ocfs2_extend_meta_needed(et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
+					      num_clusters + 2);
+
+	ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &meta_add, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
+	     meta_add, num_clusters, *credits);
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
+						meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (data_ac) {
+		ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
+					     data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_duplicate_clusters_in_large_page(handle_t *handle,
+					struct ocfs2_caching_info *ci,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 len,
+					u64 old_block, u64 new_block)
+{
+	int ret = 0;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	int i, bpc = ocfs2_clusters_to_blocks(sb, 1);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+	int page_start = context->cow_start / cpp;
+	int cow_page, cow_len, cp_len;
+	u64 phys;
+	struct page *page;
+	void *kaddr;
+	unsigned int from, cp_from, to;
+
+	while (len) {
+		phys = new_block;
+		cow_page = cpos / cpp;
+		cow_len = cpp - cpos % cpp;
+		cp_len = 0;
+		if (cow_len > len)
+			cow_len = len;
+
+		page = context->cow_pages[cow_page - page_start];
+		cp_from = from = (cpos % cpp) << osb->s_clustersize_bits;
+		to = from + (cow_len << osb->s_clustersize_bits);
+
+		while (cp_len < cow_len) {
+			ret = ocfs2_read_blocks(ci, old_block, bpc,
+						context->bhs, 0, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			for (i = 0; i < bpc; i++)
+				memcpy(kaddr + cp_from + i * sb->s_blocksize,
+					       context->bhs[i]->b_data,
+					       sb->s_blocksize);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			for (i = 0; i < bpc; i++) {
+				brelse(context->bhs[i]);
+				context->bhs[i] = NULL;
+			}
+
+			cpos++;
+			cp_len++;
+			old_block += bpc;
+			cp_from += osb->s_clustersize;
+		}
+
+		ocfs2_map_and_dirty_page(context->inode,
+					 handle, from, to,
+					 page, 0, &phys);
+
+		len -= cow_len;
+		new_block += bpc * cow_len;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_duplicate_clusters(handle_t *handle,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 old_cluster,
+				    u32 new_cluster, u32 new_len)
+{
+	int ret = 0, bh_num;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	int i, j, bpc = ocfs2_clusters_to_blocks(sb, 1);
+	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+	u64 phys, new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int page_start, ppc = ocfs2_pages_per_cluster(sb);
+	int bpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+	struct page *page;
+	void *kaddr;
+	unsigned int from;
+
+	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n",
old_cluster,
+	     new_cluster, new_len, cpos);
+
+	if (osb->s_clustersize_bits >= PAGE_CACHE_SHIFT) {
+		/*
+		 * Page size is less than cluster size, so we just need
+		 * to write all the pages in the new clusters.
+		 */
+		while (new_len) {
+			phys = new_block;
+			ret = ocfs2_read_blocks(ci, old_block, bpc,
+						context->bhs, 0, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			bh_num = 0;
+
+			page_start = (cpos - context->cow_start) * ppc;
+			from = cpos << osb->s_clustersize_bits;
+
+			for (i = 0; i < ppc; i++, from += PAGE_CACHE_SIZE) {
+				page = context->cow_pages[page_start + i];
+
+				kaddr = kmap_atomic(page, KM_USER0);
+				for (j = 0; j < bpp; j++, bh_num++)
+					memcpy(kaddr + j * sb->s_blocksize,
+					       context->bhs[bh_num]->b_data,
+					       sb->s_blocksize);
+				kunmap_atomic(kaddr, KM_USER0);
+
+				ocfs2_map_and_dirty_page(context->inode,
+							 handle, 0,
+							 PAGE_CACHE_SIZE,
+							 page, 0, &phys);
+			}
+
+			for (i = 0; i < bpc; i++) {
+				brelse(context->bhs[i]);
+				context->bhs[i] = NULL;
+			}
+
+			new_len--;
+			cpos++;
+			old_block += bpc;
+			new_block += bpc;
+		}
+	} else {
+		ret = ocfs2_duplicate_clusters_in_large_page(handle, ci,
+							     context,
+							     cpos, new_len,
+							     old_block,
+							     new_block);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	struct ocfs2_extent_rec replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+								   p_cluster));
+	replace_rec.e_flags = ext_flags;
+	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	path = ocfs2_new_path_from_et(et);
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, et, path, index,
+				 &replace_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(path);
+	return ret;
+}
+
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	int ret;
+	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	u64 ino = ocfs2_metadata_cache_owner(ci);
+
+	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags
%u\n",
+	     (unsigned long long)ino, cpos, old, new, len, ext_flags);
+
+	/*If the old clusters is unwritten, no need to duplicate. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		ret = ocfs2_duplicate_clusters(handle, context, cpos,
+					       old, new, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+				       cpos, new, len, ext_flags,
+				       context->meta_ac, &context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, credits =  0;
+	u32 new_bit, new_len;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *handle;
+
+	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
+					     &context->di_et,
+					     context->ref_ci,
+					     context->ref_root_bh,
+					     &context->meta_ac,
+					     &context->data_ac, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (num_clusters) {
+		ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
+					     1, num_clusters,
+					     &new_bit, &new_len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_replace_clusters(handle, context,
+					     cpos, p_cluster, new_bit,
+					     new_len, e_flags);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		cpos += new_len;
+		p_cluster += new_len;
+		num_clusters -= new_len;
+	}
+
+	ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+					context->ref_root_bh,
+					p_cluster, num_clusters,
+					context->meta_ac,
+					&context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (context->data_ac) {
+		ocfs2_free_alloc_context(context->data_ac);
+		context->data_ac = NULL;
+	}
+	if (context->meta_ac) {
+		ocfs2_free_alloc_context(context->meta_ac);
+		context->meta_ac = NULL;
+	}
+
+	return ret;
+}
+
+static int ocfs2_replace_cow(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     struct buffer_head *ref_root_bh,
+			     u32 cow_start, u32 cow_len,
+			     struct page **pages,
+			     int num_pages)
+{
+	int ret;
+	u32 p_cluster, num_clusters, start = cow_start;
+	unsigned int ext_flags;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cow_context context;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
+		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
+			    "tree, but the feature bit is not set in the "
+			    "super block.", inode->i_ino);
+		return -EROFS;
+	}
+
+	memset(&context, 0, sizeof(context));
+
+	context.inode = inode;
+	context.cow_pages = pages;
+	context.num_pages = num_pages;
+	context.cow_start = cow_start;
+	context.cow_len = cow_len;
+	context.ref_ci = INODE_CACHE(inode);
+	context.ref_root_bh = ref_root_bh;
+
+	context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1),
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!context.bhs) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_init_dealloc_ctxt(&context.dealloc);
+	ocfs2_init_dinode_extent_tree(&context.di_et,
+				      INODE_CACHE(inode), di_bh);
+
+	while (cow_len) {
+		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
+
+		if (cow_len < num_clusters)
+			num_clusters = cow_len;
+
+		ret = ocfs2_make_clusters_writable(inode->i_sb, &context,
+						   cow_start, p_cluster,
+						   num_clusters, ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		cow_len -= num_clusters;
+		cow_start += num_clusters;
+	}
+
+
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, start);
+
+	if (ocfs2_dealloc_has_cluster(&context.dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &context.dealloc);
+	}
+
+	kfree(context.bhs);
+	return ret;
+}
+
+int ocfs2_refcount_cow(struct inode *inode,
+		       struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len)
+{
+	int ret, has_data = 0, num_pages = 0;
+	u32 cow_start = 0, cow_len = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *ref_root_bh = NULL;
+	struct page **pages = NULL;
+	loff_t start, end;
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di->i_refcount_loc);
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
+					      &cow_start, &cow_len, &has_data);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
+	     "cow_len %u\n", inode->i_ino,
+	     cpos, write_len, cow_start, cow_len);
+
+	BUG_ON(cow_len == 0);
+
+	if (has_data) {
+		pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb) * cow_len,
+				sizeof(struct page *), GFP_NOFS);
+		if (pages == NULL) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		start = cow_start << OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+		end = start +
+			(cow_len << OCFS2_SB(inode->i_sb)->s_clustersize_bits);
+		ret = ocfs2_grab_pages(inode, start, end, pages, &num_pages);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh,
+				cow_start, cow_len, pages, num_pages);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (pages) {
+		ocfs2_unlock_and_free_pages(pages, num_pages);
+		kfree(pages);
+	}
+	brelse(ref_root_bh);
+	return ret;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 0fdf726..40389b4 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -33,4 +33,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  struct ocfs2_alloc_context **meta_ac);
+int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 15/39] ocfs2: CoW refcount tree improvement.

During CoW, if the old extent record is refcounted, we allocate
som new clusters and do CoW. Actually we can have some improvement
here. If the old extent has refcount=1, that means now it is only
used by this file. So we don't need to allocate new clusters, just
remove the refcounted flag and it is OK. We also have to remove
it from the refcount tree while not deleting it.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |    2 +-
 fs/ocfs2/refcounttree.c |  107 +++++++++++++++++++++++++++++++++++------------
 fs/ocfs2/refcounttree.h |    3 +-
 3 files changed, 83 insertions(+), 29 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 365b96e..09f47f8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6847,7 +6847,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 					ocfs2_blocks_to_clusters(osb->sb,
 								 delete_blk),
 					clusters_to_del, meta_ac,
-					&tc->tc_dealloc);
+					&tc->tc_dealloc, 1);
 		else
 			status = ocfs2_truncate_log_append(osb, handle,
 							   delete_blk,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b71753b..85f09bf 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1441,7 +1441,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 				     struct buffer_head *ref_root_bh,
 				     u64 cpos, u32 len,
 				     struct ocfs2_alloc_context *meta_ac,
-				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+				     struct ocfs2_cached_dealloc_ctxt *dealloc,
+				     int delete)
 {
 	int ret = 0, index = 0;
 	struct ocfs2_refcount_rec rec;
@@ -1449,9 +1450,10 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct buffer_head *ref_leaf_bh = NULL;
 
-	mlog(0, "Tree owner %llu, decrease refcount start %llu, len %u\n",
+	mlog(0, "Tree owner %llu, decrease refcount start %llu, "
+	     "len %u, delete %u\n",
 	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)cpos, len);
+	     (unsigned long long)cpos, len, delete);
 
 	while (len) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -1464,6 +1466,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 
 		r_count = le32_to_cpu(rec.r_refcount);
 		BUG_ON(r_count == 0);
+		if (!delete)
+			BUG_ON(r_count > 1);
 
 		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
 			      le32_to_cpu(rec.r_clusters)) - cpos;
@@ -1477,7 +1481,7 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 			goto out;
 		}
 
-		if (le32_to_cpu(rec.r_refcount) == 1) {
+		if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
 			ret = ocfs2_cache_cluster_dealloc(dealloc,
 					  ocfs2_clusters_to_blocks(sb, cpos),
 							  r_len);
@@ -1501,7 +1505,8 @@ out:
 int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete)
 {
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -1520,7 +1525,7 @@ int ocfs2_decrease_refcount(struct inode *inode, struct
buffer_head *di_bh,
 	}
 
 	ret = __ocfs2_decrease_refcount(handle, INODE_CACHE(inode), ref_root_bh,
-					cpos, len, meta_ac, dealloc);
+					cpos, len, meta_ac, dealloc, delete);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -2207,10 +2212,16 @@ static int ocfs2_make_clusters_writable(struct
super_block *sb,
 					u32 cpos, u32 p_cluster,
 					u32 num_clusters, unsigned int e_flags)
 {
-	int ret, credits =  0;
+	int ret, delete, index, credits =  0;
 	u32 new_bit, new_len;
+	unsigned int set_len;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	handle_t *handle;
+	struct buffer_head *ref_leaf_bh = NULL;
+	struct ocfs2_refcount_rec rec;
+
+	mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
+	     cpos, p_cluster, num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
 					     &context->di_et,
@@ -2231,36 +2242,77 @@ static int ocfs2_make_clusters_writable(struct
super_block *sb,
 	}
 
 	while (num_clusters) {
-		ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
-					     1, num_clusters,
-					     &new_bit, &new_len);
+		ret = ocfs2_get_refcount_rec(context->ref_ci,
+					     context->ref_root_bh,
+					     p_cluster, num_clusters,
+					     &rec, &index, &ref_leaf_bh);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			break;
 		}
 
-		ret = ocfs2_replace_clusters(handle, context,
-					     cpos, p_cluster, new_bit,
-					     new_len, e_flags);
+		BUG_ON(!rec.r_refcount);
+		set_len = min((u64)p_cluster + num_clusters,
+			      le64_to_cpu(rec.r_cpos) +
+			      le32_to_cpu(rec.r_clusters)) - p_cluster;
+
+		/*
+		 * There are many different situation here.
+		 * 1. If refcount == 1, remove the flag and do COW no delete.
+		 * 2. If refcount > 1, allocate clusters.
+		 *    Here we may not allocate r_len once at a time, so continue
+		 *    until we reach num_clusters.
+		 */
+		if (le32_to_cpu(rec.r_refcount) == 1) {
+			delete = 0;
+			ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+						       cpos, p_cluster,
+						       set_len, e_flags,
+						       context->meta_ac,
+						       &context->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		} else {
+			delete = 1;
+
+			ret = __ocfs2_claim_clusters(osb, handle,
+						     context->data_ac,
+						     1, set_len,
+						     &new_bit, &new_len);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+
+			ret = ocfs2_replace_clusters(handle, context,
+						     cpos, p_cluster, new_bit,
+						     new_len, e_flags);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+			set_len = new_len;
+		}
+
+		ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
+						context->ref_root_bh,
+						p_cluster, set_len,
+						context->meta_ac,
+						&context->dealloc, delete);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			break;
 		}
 
-		cpos += new_len;
-		p_cluster += new_len;
-		num_clusters -= new_len;
+		cpos += set_len;
+		p_cluster += set_len;
+		num_clusters -= set_len;
+		brelse(ref_leaf_bh);
+		ref_leaf_bh = NULL;
 	}
 
-	ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
-					context->ref_root_bh,
-					p_cluster, num_clusters,
-					context->meta_ac,
-					&context->dealloc);
-	if (ret)
-		mlog_errno(ret);
-
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -2272,6 +2324,7 @@ out:
 		ocfs2_free_alloc_context(context->meta_ac);
 		context->meta_ac = NULL;
 	}
+	brelse(ref_leaf_bh);
 
 	return ret;
 }
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 40389b4..445af2e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -26,7 +26,8 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct
buffer_head *di_bh);
 int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc);
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    int delete);
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  struct buffer_head *di_bh,
 					  u64 phys_blkno,
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 16/39] ocfs2: Add __ocfs2_reflink.

ioctl will call __ocfs2_reflink. And it will:
1. Create a new refcount tree to the old file if it doesn't have one
   and insert all the extent records to the tree if they are not
   refcounted.
2. Insert all the extent records to the new inode's extent list.
3. Increase the r_count for all the items in the refcount tree.

Note:
This patch use ocfs2_mknod to create a new file under the destination
directory, it isn't safe. Some following patches will try to create
the file first in orphan dir, reflink the extent list and then move
the created file to the destination directory.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/namei.c        |    6 +-
 fs/ocfs2/namei.h        |    2 +
 fs/ocfs2/refcounttree.c |  379 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 383 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 31fc13e..e38b9dc 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -217,10 +217,8 @@ static struct inode *ocfs2_get_init_inode(struct inode
*dir, int mode)
 	return inode;
 }
 
-static int ocfs2_mknod(struct inode *dir,
-		       struct dentry *dentry,
-		       int mode,
-		       dev_t dev)
+int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
+		int mode, dev_t dev)
 {
 	int status = 0;
 	struct buffer_head *parent_fe_bh = NULL;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef6..7f9cc46 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     struct inode *orphan_dir_inode,
 		     struct inode *inode,
 		     struct buffer_head *orphan_dir_bh);
+int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
+		int mode, dev_t dev);
 
 #endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 85f09bf..2edb5d1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -32,6 +32,7 @@
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "aops.h"
+#include "namei.h"
 
 struct ocfs2_cow_context {
 	struct inode *inode;
@@ -2476,3 +2477,381 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
+
+/*
+ * Insert a new extent into refcount tree and mark a extent rec
+ * as refcounted in the dinode tree.
+ */
+static int ocfs2_add_refcount_flag(struct inode *inode,
+				   struct ocfs2_extent_tree *di_et,
+				   struct ocfs2_caching_info *ref_ci,
+				   struct buffer_head *ref_root_bh,
+				   u32 cpos, u32 p_cluster, u32 num_clusters,
+				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 1, ref_blocks = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &ref_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, credits = %d\n",
+	     ref_blocks, credits);
+
+	if (ref_blocks) {
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_mark_extent_refcounted(inode, di_et, handle,
+					   cpos, num_clusters, p_cluster,
+					   meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters,
+					meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+				      struct buffer_head *fe_bh)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	unsigned int ext_flags;
+	loff_t size;
+	u32 cpos, num_clusters, clusters, p_cluster;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree di_et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+		ret = ocfs2_create_refcount_tree(inode, fe_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	BUG_ON(!di->i_refcount_loc);
+	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), fe_bh);
+
+	size = i_size_read(inode);
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		cpos += num_clusters;
+		if ((ext_flags & OCFS2_EXT_REFCOUNTED) || !p_cluster)
+			continue;
+
+		ret = ocfs2_add_refcount_flag(inode, &di_et,
+					      INODE_CACHE(inode), ref_root_bh,
+					      cpos - num_clusters,
+					      p_cluster, num_clusters,
+					      &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	/*
+	 * Empty the extent map so that we may get the right extent
+	 * record from the disk.
+	 */
+	ocfs2_extent_map_trunc(inode, 0);
+	brelse(ref_root_bh);
+out:
+
+	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(OCFS2_SB(inode->i_sb), 1);
+		ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc);
+	}
+	return ret;
+}
+
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ref_ci,
+				   struct buffer_head *ref_root_bh,
+				   u32 cpos, u32 p_cluster, u32 num_clusters,
+				   unsigned int ext_flags,
+				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+					     p_cluster, num_clusters,
+					     et, ref_ci,
+					     ref_root_bh, &meta_ac,
+					     NULL, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_insert_extent(handle, et, cpos,
+			cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+							     p_cluster)),
+			num_clusters, ext_flags, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters,
+					meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+				struct inode *t_inode,
+				struct buffer_head *t_bh,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters, clusters, cpos;
+	loff_t size;
+	unsigned int ext_flags;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+	size = i_size_read(s_inode);
+	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+
+		if (p_cluster) {
+			ret = ocfs2_add_refcounted_extent(t_inode, &et,
+							  INODE_CACHE(t_inode),
+							  ref_root_bh,
+							  cpos, p_cluster,
+							  num_clusters,
+							  ext_flags,
+							  dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += num_clusters;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_complete_reflink(struct inode *s_inode,
+				  struct inode *t_inode,
+				  struct buffer_head *t_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)t_bh->b_data;
+	loff_t size = i_size_read(s_inode);
+
+	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&OCFS2_I(t_inode)->ip_lock);
+	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
+	fe->i_clusters = cpu_to_le32(OCFS2_I(s_inode)->ip_clusters);
+	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
+	i_size_write(t_inode, size);
+	fe->i_size = cpu_to_le64(size);
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+	return ret;
+}
+
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+				     struct buffer_head *s_bh,
+				     struct inode *t_inode,
+				     struct buffer_head *t_bh)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_extent_list *el;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+				      le64_to_cpu(di->i_refcount_loc));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(INODE_CACHE(t_inode),
+					le64_to_cpu(di->i_refcount_loc),
+					&ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+
+	el = &di->id2.i_list;
+
+	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+					  ref_root_bh, &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_complete_reflink(s_inode, t_inode, t_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	brelse(ref_root_bh);
+
+	return ret;
+}
+
+static int __ocfs2_reflink(struct dentry *old_dentry,
+			   struct buffer_head *old_bh,
+			   struct inode *dir,
+			   struct dentry *dentry)
+{
+	int ret;
+	struct inode *inode = old_dentry->d_inode;
+	struct inode *new_inode;
+	struct buffer_head *new_bh = NULL;
+
+	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n",
inode->i_ino,
+		   old_dentry->d_name.len, old_dentry->d_name.name,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	ret = ocfs2_mknod(dir, dentry, inode->i_mode, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_attach_refcount_tree(inode, old_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	new_inode = dentry->d_inode;
+
+	mutex_lock(&new_inode->i_mutex);
+	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_inode_unlock(new_inode, 1);
+	brelse(new_bh);
+out_unlock:
+	mutex_unlock(&new_inode->i_mutex);
+out:
+	mlog_exit(ret);
+
+	return ret;
+}
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 17/39] ocfs2: Add ioctl for reflink.

The ioctl will take 2 parameters: old_path and new_path and
works like link.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/ioctl.c        |   12 ++++
 fs/ocfs2/ocfs2_fs.h     |    8 +++
 fs/ocfs2/refcounttree.c |  147 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    3 +
 4 files changed, 170 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 9fcd36d..4f189b7 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -22,6 +22,7 @@
 #include "ocfs2_fs.h"
 #include "ioctl.h"
 #include "resize.h"
+#include "refcounttree.h"
 
 #include <linux/ext2_fs.h>
 
@@ -116,6 +117,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
 	int status;
 	struct ocfs2_space_resv sr;
 	struct ocfs2_new_group_input input;
+	struct reflink_arguments args;
+	const char *old_path, *new_path;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -161,6 +164,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_group_add(inode, &input);
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, (struct reflink_arguments *)arg,
+				   sizeof(args)))
+			return -EFAULT;
+		old_path = (const char *)(unsigned long)args.old_path;
+		new_path = (const char *)(unsigned long)args.new_path;
+
+		return ocfs2_reflink(inode, old_path, new_path);
 	default:
 		return -ENOTTY;
 	}
@@ -183,6 +194,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd,
unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	case OCFS2_IOC_REFLINK:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 66d0bd7..04adaf9 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -300,6 +300,14 @@ struct ocfs2_new_group_input {
 #define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
 #define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
 
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+	__u64 old_path;
+	__u64 new_path;
+};
+#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
+
+
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 2edb5d1..fbdefb0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -34,6 +34,11 @@
 #include "aops.h"
 #include "namei.h"
 
+#include <linux/security.h>
+#include <linux/quotaops.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+
 struct ocfs2_cow_context {
 	struct inode *inode;
 	struct ocfs2_extent_tree di_et;
@@ -2855,3 +2860,145 @@ out:
 
 	return ret;
 }
+
+/* copied from may_create in VFS. */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+	if (child->d_inode)
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/* Most codes are copied from vfs_link. */
+static int ocfs2_vfs_reflink(struct dentry *old_dentry,
+			     struct inode *dir,
+			     struct dentry *new_dentry)
+{
+	struct buffer_head *old_bh = NULL;
+	struct inode *inode = old_dentry->d_inode;
+	int error;
+
+	if (!inode)
+		return -ENOENT;
+
+	error = ocfs2_may_create(dir, new_dentry);
+	if (error)
+		return error;
+
+	if (dir->i_sb != inode->i_sb)
+		return -EXDEV;
+
+	/*
+	 * A link to an append-only or immutable file cannot be created.
+	 */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return -EPERM;
+	if (S_ISDIR(inode->i_mode))
+		return -EPERM;
+
+	if (!OCFS2_I(inode)->ip_clusters) {
+		mlog(ML_ERROR, "reflink doesn't work with 0 cluster files.\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	error = ocfs2_inode_lock(inode, &old_bh, 1);
+	if (error) {
+		mlog_errno(error);
+		goto out_unlock;
+	}
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	vfs_dq_init(dir);
+	error = __ocfs2_reflink(old_dentry, old_bh, dir, new_dentry);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 1);
+	brelse(old_bh);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+
+	return error;
+}
+
+/* copied from user_path_parent. */
+static int ocfs2_user_path_parent(const char __user *path,
+				  struct nameidata *nd, char **name)
+{
+	char *s = getname(path);
+	int error;
+
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	error = path_lookup(s, LOOKUP_PARENT, nd);
+	if (error)
+		putname(s);
+	else
+		*name = s;
+
+	return error;
+}
+
+/*
+ * Most codes are copied from sys_linkat.
+ */
+int ocfs2_reflink(struct inode *inode,
+		  const char __user *oldname,
+		  const char __user *newname)
+{
+	struct dentry *new_dentry;
+	struct nameidata nd;
+	struct path old_path;
+	int error;
+	char *to = NULL;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
+	if (error) {
+		mlog_errno(error);
+		return error;
+	}
+
+	error = ocfs2_user_path_parent(newname, &nd, &to);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = -EXDEV;
+	if (old_path.mnt != nd.path.mnt)
+		goto out_release;
+	new_dentry = lookup_create(&nd, 0);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry)) {
+		mlog_errno(error);
+		goto out_unlock;
+	}
+
+	error = mnt_want_write(nd.path.mnt);
+	if (error) {
+		mlog_errno(error);
+		goto out_dput;
+	}
+
+	error = ocfs2_vfs_reflink(old_path.dentry,
+				  nd.path.dentry->d_inode,
+				  new_dentry);
+	mnt_drop_write(nd.path.mnt);
+out_dput:
+	dput(new_dentry);
+out_unlock:
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out_release:
+	path_put(&nd.path);
+	putname(to);
+out:
+	path_put(&old_path);
+
+	return error;
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 445af2e..7f44093 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -36,4 +36,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len);
+int ocfs2_reflink(struct inode *inode,
+		  const char __user *oldname,
+		  const char __user *newname);
 #endif /* OCFS2_REFCOUNTTREE_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 18/39] ocfs2: Use proper parameter for some inode operation.

In order to make the original function more suitable for reflink,
we modify the following inode operations. Both are tiny.

1. ocfs2_mknod_locked only use dentry for mlog, so move it to
   the caller so that reflink can use it without dentry.
2. ocfs2_prepare_orphan_dir only want inode to get its ip_blkno.
   So use ip_blkno instead.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/namei.c |   32 +++++++++++++++++---------------
 1 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e38b9dc..04ffbfe 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *inode,
-			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
-				    struct inode *inode,
+				    u64 blkno,
 				    char *name,
 				    struct ocfs2_dir_lookup_result *lookup);
 
@@ -356,8 +355,12 @@ int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
 	}
 	did_quota_inode = 1;
 
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
 	if (status < 0) {
@@ -464,7 +467,6 @@ leave:
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *inode,
-			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
@@ -478,10 +480,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	u16 suballoc_bit;
 	u16 feat;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
-
 	*new_fe_bh = NULL;
 
 	status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -849,7 +847,8 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
+		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+						  OCFS2_I(inode)->ip_blkno,
 						  orphan_name, &orphan_insert);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1240,9 +1239,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
 			status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
-							  new_inode,
-							  orphan_name,
-							  &orphan_insert);
+						OCFS2_I(new_inode)->ip_blkno,
+						orphan_name, &orphan_insert);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -1698,7 +1696,11 @@ static int ocfs2_symlink(struct inode *dir,
 	}
 	did_quota_inode = 1;
 
-	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
+		   inode->i_mode, dentry->d_name.len,
+		   dentry->d_name.name);
+
+	status = ocfs2_mknod_locked(osb, dir, inode,
 				    0, &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
 	if (status < 0) {
@@ -1848,7 +1850,7 @@ bail:
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
-				    struct inode *inode,
+				    u64 blkno,
 				    char *name,
 				    struct ocfs2_dir_lookup_result *lookup)
 {
@@ -1856,7 +1858,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super
*osb,
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 
-	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+	status = ocfs2_blkno_stringify(blkno, name);
 	if (status < 0) {
 		mlog_errno(status);
 		return status;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 19/39] ocfs2: Create reflinked file in orphan dir.

reflink is a very complicated process, so it can't be integrated
into one transaction. So if the system panic in the operation, we
may leave a unfinished inode in the destication directory.

This patch try to create an inode in orphan_dir first, reflink it
to the src file and then move it to the destication file in the end.
So we are not afraid of any corruption during the reflink.

In the mean time, we never use ocfs2_mknod, so make it static again.

Note:
fsck.ocfs2 should work for us to remove the unfinished file in the
orphan_dir.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/namei.c        |  260 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/namei.h        |    9 +-
 fs/ocfs2/refcounttree.c |   35 ++++---
 3 files changed, 283 insertions(+), 21 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 04ffbfe..3baeac3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -216,8 +216,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir,
int mode)
 	return inode;
 }
 
-int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
-		int mode, dev_t dev)
+static int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
+		       int mode, dev_t dev)
 {
 	int status = 0;
 	struct buffer_head *parent_fe_bh = NULL;
@@ -2040,6 +2040,262 @@ leave:
 	return status;
 }
 
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode)
+{
+	int status, did_quota_inode = 0;
+	struct inode *inode = NULL;
+	struct inode *orphan_dir = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *fe = NULL;
+	handle_t *handle = NULL;
+	char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
+	struct buffer_head *parent_fe_bh = NULL;
+	struct buffer_head *new_fe_bh = NULL;
+	struct ocfs2_alloc_context *inode_ac = NULL;
+	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	/*
+	 * We give the orphan dir the root blkno to fake an orphan name,
+	 * and allocate enough space for our insertion.
+	 */
+	status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
+					  osb->root_blkno,
+					  orphan_name, &orphan_insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* reserve an inode spot */
+	status = ocfs2_reserve_new_inode(osb, &inode_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota_inode = 1;
+
+	/* do the real work now. */
+	status = ocfs2_mknod_locked(osb, dir, inode,
+				    0, &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	fe = (struct ocfs2_dinode *)new_fe_bh->b_data;
+	status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+				  &orphan_insert, orphan_dir);
+	if (status < 0)
+		mlog_errno(status);
+
+leave:
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (orphan_dir) {
+		/* This was locked for us in ocfs2_prepare_orphan_dir() */
+		ocfs2_inode_unlock(orphan_dir, 1);
+		mutex_unlock(&orphan_dir->i_mutex);
+		iput(orphan_dir);
+	}
+
+	if (status == -ENOSPC)
+		mlog(0, "Disk is full\n");
+
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
+		iput(inode);
+	}
+
+	if (inode_ac)
+		ocfs2_free_alloc_context(inode_ac);
+
+	brelse(new_fe_bh);
+
+	if (!status)
+		*new_inode = inode;
+
+	ocfs2_inode_unlock(dir, 1);
+	return status;
+}
+
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *inode,
+				   struct dentry *dentry)
+{
+	int status = 0;
+	struct buffer_head *parent_fe_bh = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_dinode *dirfe, *fe;
+	struct inode *orphan_dir_inode = NULL;
+	struct buffer_head *orphan_dir_bh = NULL;
+	struct buffer_head *fe_bh = NULL;
+	struct ocfs2_dir_lookup_result lookup = { NULL, };
+
+	mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
+		   dentry->d_name.len, dentry->d_name.name);
+
+	status = ocfs2_inode_lock(dir, &parent_fe_bh, 1);
+	if (status < 0) {
+		if (status != -ENOENT)
+			mlog_errno(status);
+		return status;
+	}
+
+	dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	if (!dirfe->i_links_count) {
+		/* can't make a file in a deleted directory. */
+		status = -ENOENT;
+		goto leave;
+	}
+
+	status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
+					   dentry->d_name.len);
+	if (status)
+		goto leave;
+
+	/* get a spot inside the dir. */
+	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
+					      dentry->d_name.name,
+					      dentry->d_name.len, &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+						       ORPHAN_DIR_SYSTEM_INODE,
+						       osb->slot_num);
+	if (!orphan_dir_inode) {
+		status = -EEXIST;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	mutex_lock(&orphan_dir_inode->i_mutex);
+
+	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		mutex_unlock(&orphan_dir_inode->i_mutex);
+		iput(orphan_dir_inode);
+		goto leave;
+	}
+
+	status = ocfs2_read_inode_block(inode, &fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto orphan_unlock;
+	}
+
+	status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
+				  orphan_dir_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	fe = (struct ocfs2_dinode *)fe_bh->b_data;
+	le32_add_cpu(&fe->i_flags, -OCFS2_ORPHANED_FL);
+	fe->i_orphaned_slot = 0;
+	ocfs2_journal_dirty(handle, fe_bh);
+
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	insert_inode_hash(inode);
+	dentry->d_op = &ocfs2_dentry_ops;
+	d_instantiate(dentry, inode);
+	status = 0;
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+orphan_unlock:
+	ocfs2_inode_unlock(orphan_dir_inode, 1);
+	mutex_unlock(&orphan_dir_inode->i_mutex);
+	iput(orphan_dir_inode);
+leave:
+
+	ocfs2_inode_unlock(dir, 1);
+
+	brelse(parent_fe_bh);
+	brelse(orphan_dir_bh);
+
+	mlog_exit(status);
+
+	return status;
+}
+
 const struct inode_operations ocfs2_dir_iops = {
 	.create		= ocfs2_create,
 	.lookup		= ocfs2_lookup,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 7f9cc46..1db41a2 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,7 +35,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     struct inode *orphan_dir_inode,
 		     struct inode *inode,
 		     struct buffer_head *orphan_dir_bh);
-int ocfs2_mknod(struct inode *dir, struct dentry *dentry,
-		int mode, dev_t dev);
-
+int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 int mode,
+				 struct inode **new_inode);
+int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct inode *new_inode,
+				   struct dentry *new_dentry);
 #endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index fbdefb0..8927c33 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2814,32 +2814,18 @@ out:
 
 static int __ocfs2_reflink(struct dentry *old_dentry,
 			   struct buffer_head *old_bh,
-			   struct inode *dir,
-			   struct dentry *dentry)
+			   struct inode *new_inode)
 {
 	int ret;
 	struct inode *inode = old_dentry->d_inode;
-	struct inode *new_inode;
 	struct buffer_head *new_bh = NULL;
 
-	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n",
inode->i_ino,
-		   old_dentry->d_name.len, old_dentry->d_name.name,
-		   dentry->d_name.len, dentry->d_name.name);
-
-	ret = ocfs2_mknod(dir, dentry, inode->i_mode, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_attach_refcount_tree(inode, old_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	new_inode = dentry->d_inode;
-
 	mutex_lock(&new_inode->i_mutex);
 	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
 	if (ret) {
@@ -2878,6 +2864,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry,
 {
 	struct buffer_head *old_bh = NULL;
 	struct inode *inode = old_dentry->d_inode;
+	struct inode *new_orphan_inode;
 	int error;
 
 	if (!inode)
@@ -2903,6 +2890,13 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry,
 		return -EINVAL;
 	}
 
+	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+					     &new_orphan_inode);
+	if (error) {
+		mlog_errno(error);
+		goto out_unlock;
+	}
+
 	mutex_lock(&inode->i_mutex);
 	error = ocfs2_inode_lock(inode, &old_bh, 1);
 	if (error) {
@@ -2912,14 +2906,23 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry,
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	vfs_dq_init(dir);
-	error = __ocfs2_reflink(old_dentry, old_bh, dir, new_dentry);
+	error = __ocfs2_reflink(old_dentry, old_bh, new_orphan_inode);
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (error)
+		mlog_errno(error);
 
 	ocfs2_inode_unlock(inode, 1);
 	brelse(old_bh);
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 
+	if (!error) {
+		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+						       new_dentry);
+		if (error)
+			mlog_errno(error);
+	}
+
 	return error;
 }
 
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 20/39] ocfs2: Abstract caching info checkpoint.

In meta downconvert, we need to checkpoint the metadata in an inode.
For refcount tree, we also need it. So abstract the process out.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/dlmglue.c |   18 +++++++++++++-----
 1 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index be00c74..030e66f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3388,11 +3388,11 @@ out:
 	return UNBLOCK_CONTINUE;
 }
 
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
-					int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+				 struct ocfs2_lock_res *lockres,
+				 int new_level)
 {
-	struct inode *inode = ocfs2_lock_res_inode(lockres);
-	int checkpointed = ocfs2_ci_fully_checkpointed(INODE_CACHE(inode));
+	int checkpointed = ocfs2_ci_fully_checkpointed(ci);
 
 	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
 	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3400,10 +3400,18 @@ static int ocfs2_check_meta_downconvert(struct
ocfs2_lock_res *lockres,
 	if (checkpointed)
 		return 1;
 
-	ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+	ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
 	return 0;
 }
 
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+					int new_level)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct inode *inode = ocfs2_lock_res_inode(lockres);
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 21/39] ocfs2: Add new refcount tree lock resource.

refcount tree lock resource is used to protect refcount
tree read/write among multiple nodes.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/dlmglue.c      |   68 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmglue.h      |    4 +++
 fs/ocfs2/ocfs2_lockid.h |    5 +++
 fs/ocfs2/refcounttree.h |   14 +++++++++
 4 files changed, 91 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 030e66f..f8d0d30 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
 #include "super.h"
 #include "uptodate.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -107,6 +108,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super
*osb,
 
 static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level);
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking);
+
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level,
__PRETTY_FUNCTION__, __LINE__, __lockres)
 
 /* This aids in debugging situations where a bad LVB might be involved. */
@@ -271,6 +277,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
+	.check_downconvert = ocfs2_check_refcount_downconvert,
+	.downconvert_worker = ocfs2_refcount_convert_worker,
+	.flags		= 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -663,6 +675,16 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res
*lockres,
 				   info);
 }
 
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+			      0, lockres->l_name);
+	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+				   &ocfs2_refcount_block_lops, osb);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -3541,6 +3563,24 @@ static int ocfs2_dentry_convert_worker(struct
ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level)
+{
+	struct ocfs2_refcount_tree *tree = OCFS2_REF_ITEM(lockres);
+
+	return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking)
+{
+	struct ocfs2_refcount_tree *tree = OCFS2_REF_ITEM(lockres);
+
+	ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+	return UNBLOCK_CONTINUE;
+}
+
 static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct ocfs2_qinfo_lvb *lvb;
@@ -3652,6 +3692,34 @@ bail:
 	return status;
 }
 
+int ocfs2_refcount_lock(struct ocfs2_lock_res *lockres, int ex)
+{
+	int status;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_lock_res *lockres, int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
 /*
  * This is the filesystem locking protocol.  It provides the lock handling
  * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd572..b2700d2 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -85,6 +85,8 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 struct ocfs2_mem_dqinfo;
 void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
                                struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -123,6 +125,8 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
 int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_refcount_lock(struct ocfs2_lock_res *lockres, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_lock_res *lockres, int ex);
 
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87..f63e56f 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_FLOCK,
 	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_LOCK_TYPE_NFS_SYNC,
+	OCFS2_LOCK_TYPE_REFCOUNT,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type
type)
 		case OCFS2_LOCK_TYPE_NFS_SYNC:
 			c = 'Y';
 			break;
+		case OCFS2_LOCK_TYPE_REFCOUNT:
+			c = 'T';
+			break;
 		default:
 			c = '\0';
 	}
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+	[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7f44093..428bd6e 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -17,6 +17,20 @@
 #ifndef OCFS2_REFCOUNTTREE_H
 #define OCFS2_REFCOUNTTREE_H
 
+struct ocfs2_refcount_tree {
+	struct rb_node rf_node;
+	u64 rf_blkno;
+	struct rw_semaphore rf_sem;
+	struct ocfs2_lock_res rf_lockres;
+	struct ocfs2_caching_info rf_ci;
+};
+
+static inline struct ocfs2_refcount_tree *
+OCFS2_REF_ITEM(struct ocfs2_lock_res *res)
+{
+	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
 int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
 int ocfs2_set_refcount_tree(struct inode *inode,
 			    struct buffer_head *di_bh,
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 22/39] ocfs2: Add refcount tree lock mechanism.

Refcount tree lock is only related to a block number. So
create a rb-tree for it. And the tree root is stored in
ocfs2_super.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/ocfs2.h        |    3 +
 fs/ocfs2/refcounttree.c |  187 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    7 ++
 fs/ocfs2/super.c        |    5 +
 4 files changed, 202 insertions(+), 0 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 62bee1e..3f62291 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -380,6 +380,9 @@ struct ocfs2_super
 
 	/* the group we used to allocate inodes. */
 	u64				osb_inode_alloc_group;
+
+	/* rb tree root for refcount lock. */
+	struct rb_root	osb_rf_lock_tree;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8927c33..ec9ee66 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3005,3 +3005,190 @@ out:
 
 	return error;
 }
+
+static struct ocfs2_refcount_tree *
+ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
+{
+	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	while (n) {
+		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
+
+		if (blkno < tree->rf_blkno)
+			n = n->rb_left;
+		else if (blkno > tree->rf_blkno)
+			n = n->rb_right;
+		else
+			return tree;
+	}
+
+	return NULL;
+}
+
+/* osb_lock is already locked. */
+static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
+					    struct ocfs2_refcount_tree *new)
+{
+	u64 rf_blkno = new->rf_blkno;
+	struct rb_node *parent = NULL;
+	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
+	struct ocfs2_refcount_tree *tmp;
+
+	while (*p) {
+		parent = *p;
+
+		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
+			       rf_node);
+
+		if (rf_blkno < tmp->rf_blkno)
+			p = &(*p)->rb_left;
+		else if (rf_blkno > tmp->rf_blkno)
+			p = &(*p)->rb_right;
+		else {
+			/* This should never happen! */
+			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
+			     (unsigned long long)rf_blkno);
+			BUG();
+		}
+	}
+
+	rb_link_node(&new->rf_node, parent, p);
+	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
+}
+
+static void ocfs2_free_refcount_tree(struct ocfs2_super *osb,
+				     struct ocfs2_refcount_tree *tree)
+{
+	ocfs2_simple_drop_lockres(osb, &tree->rf_lockres);
+	ocfs2_lock_res_free(&tree->rf_lockres);
+	kfree(tree);
+}
+
+static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb,
+				       struct ocfs2_refcount_tree *tree)
+{
+	spin_lock(&osb->osb_lock);
+	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
+	spin_unlock(&osb->osb_lock);
+
+	mlog(0, "delete refcount tree %llu\n",
+	     (unsigned long long)tree->rf_blkno);
+	ocfs2_free_refcount_tree(osb, tree);
+}
+
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+				   struct ocfs2_refcount_tree **ret_tree)
+{
+	int ret = 0;
+	struct ocfs2_refcount_tree *tree, *new = NULL;
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	spin_unlock(&osb->osb_lock);
+
+	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
+	if (!new) {
+		ret = -ENOMEM;
+		return ret;
+	}
+
+	new->rf_blkno = rf_blkno;
+	init_rwsem(&new->rf_sem);
+	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, rf_blkno);
+
+	spin_lock(&osb->osb_lock);
+	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (tree)
+		goto out;
+
+	ocfs2_insert_refcount_tree(osb, new);
+
+	tree = new;
+	new = NULL;
+
+out:
+	if (new)
+		ocfs2_free_refcount_tree(osb, new);
+
+	*ret_tree = tree;
+
+	spin_unlock(&osb->osb_lock);
+
+	return ret;
+}
+
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really need it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **ret_tree,
+			     struct buffer_head **ref_bh)
+{
+	int ret;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_refcount_lock(&tree->rf_lockres, rw);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rw)
+		down_write(&tree->rf_sem);
+	else
+		down_read(&tree->rf_sem);
+
+	if (ref_bh) {
+		ret = ocfs2_read_refcount_block(&tree->rf_ci,
+						ref_blkno, ref_bh);
+		if (ret) {
+			mlog_errno(ret);
+			ocfs2_unlock_refcount_tree(osb, tree, rw);
+			goto out;
+		}
+	}
+
+	*ret_tree = tree;
+out:
+	return ret;
+}
+
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree, int rw)
+{
+	ocfs2_refcount_unlock(&tree->rf_lockres, rw);
+
+	if (rw)
+		up_write(&tree->rf_sem);
+	else
+		up_read(&tree->rf_sem);
+}
+
+void ocfs2_purge_refcount_tree(struct ocfs2_super *osb)
+{
+	struct rb_node *node;
+	struct ocfs2_refcount_tree *tree;
+	struct rb_root *root = &osb->osb_rf_lock_tree;
+
+	while ((node = rb_last(root)) != NULL) {
+		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
+
+		mlog(0, "Purge tree %llu\n",
+		     (unsigned long long) tree->rf_blkno);
+
+		rb_erase(&tree->rf_node, root);
+		ocfs2_free_refcount_tree(osb, tree);
+	}
+}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 428bd6e..5590127 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -53,4 +53,11 @@ int ocfs2_refcount_cow(struct inode *inode, struct
buffer_head *di_bh,
 int ocfs2_reflink(struct inode *inode,
 		  const char __user *oldname,
 		  const char __user *newname);
+void ocfs2_purge_refcount_tree(struct ocfs2_super *osb);
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **tree,
+			     struct buffer_head **ref_bh);
+void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
+				struct ocfs2_refcount_tree *tree,
+				int rw);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 36ff979..675c115 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,6 +68,7 @@
 #include "ver.h"
 #include "xattr.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -1808,6 +1809,8 @@ static void ocfs2_dismount_volume(struct super_block *sb,
int mnt_err)
 
 	ocfs2_sync_blockdev(sb);
 
+	ocfs2_purge_refcount_tree(osb);
+
 	/* No cluster connection means we've failed during mount, so skip
 	 * all the steps which depended on that to complete. */
 	if (osb->cconn) {
@@ -2007,6 +2010,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
+	osb->osb_rf_lock_tree = RB_ROOT;
+
 	osb->s_feature_compat  	
le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat -- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 23/39] ocfs2: lock refcount tree if needed.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |   17 +++++++++++-
 fs/ocfs2/refcounttree.c |   68 +++++++++++++++++++++++++++++------------------
 2 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 09f47f8..013e600 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7283,6 +7283,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
 	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	mlog_entry_void();
 
@@ -7294,11 +7295,21 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
-		goto bail;
+		goto free;
 	}
 
 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+		status = ocfs2_lock_refcount_tree(osb,
+						le64_to_cpu(di->i_refcount_loc),
+						1, &ref_tree, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto free;
+		}
+	}
+
 start:
 	/*
 	 * Check that we still have allocation to delete.
@@ -7446,8 +7457,12 @@ bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+
 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
 
+free:
 	ocfs2_free_path(path);
 
 	/* This will drop the ext_alloc cluster lock for us */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ec9ee66..56c8c52 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -54,6 +54,9 @@ struct ocfs2_cow_context {
 	u32 cow_len;
 };
 
+static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb,
+				       struct ocfs2_refcount_tree *tree);
+
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
 {
@@ -232,15 +235,16 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *ref_tree;
 
 	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 	BUG_ON(di->i_refcount_loc);
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
-					refcount_loc, &ref_root_bh);
+	ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
+				       &ref_tree, &ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		return ret;
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
@@ -275,9 +279,11 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 	di->i_refcount_loc = cpu_to_le64(refcount_loc);
 	spin_unlock(&oi->ip_lock);
 	ocfs2_journal_dirty(handle, di_bh);
+
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 
 	return ret;
@@ -285,7 +291,7 @@ out:
 
 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
 {
-	int ret;
+	int ret, delete_tree = 0;
 	handle_t *handle = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -294,19 +300,19 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 	struct inode *alloc_inode = NULL;
 	struct buffer_head *alloc_bh = NULL;
 	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
 	int credits = OCFS2_INODE_UPDATE_CREDITS + 1;
-	u64 blk = 0, bg_blkno = 0;
+	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
 	u16 bit = 0;
 
 	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
 		return 0;
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
-					le64_to_cpu(di->i_refcount_loc),
-					&blk_bh);
+	BUG_ON(!ref_blkno);
+	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		return ret;
 	}
 
 	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
@@ -371,6 +377,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 	ocfs2_journal_dirty(handle, blk_bh);
 
 	if (!rb->rf_count) {
+		delete_tree = 1;
 		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
 					       alloc_bh, bit, bg_blkno, 1);
 		if (ret)
@@ -390,6 +397,9 @@ out_mutex:
 		iput(alloc_inode);
 	}
 out:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	if (delete_tree)
+		ocfs2_delete_refcount_tree(osb, ref_tree);
 	brelse(blk_bh);
 
 	return ret;
@@ -2422,9 +2432,11 @@ int ocfs2_refcount_cow(struct inode *inode,
 	int ret, has_data = 0, num_pages = 0;
 	u32 cow_start = 0, cow_len = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *ref_root_bh = NULL;
 	struct page **pages = NULL;
+	struct ocfs2_refcount_tree *ref_tree;
 	loff_t start, end;
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
@@ -2434,7 +2446,7 @@ int ocfs2_refcount_cow(struct inode *inode,
 					      &cow_start, &cow_len, &has_data);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		return ret;
 	}
 	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
 	     "cow_len %u\n", inode->i_ino,
@@ -2461,9 +2473,8 @@ int ocfs2_refcount_cow(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
-					le64_to_cpu(di->i_refcount_loc),
-					&ref_root_bh);
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2474,12 +2485,13 @@ int ocfs2_refcount_cow(struct inode *inode,
 	if (ret)
 		mlog_errno(ret);
 
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
 out:
 	if (pages) {
 		ocfs2_unlock_and_free_pages(pages, num_pages);
 		kfree(pages);
 	}
-	brelse(ref_root_bh);
 	return ret;
 }
 
@@ -2557,6 +2569,8 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_tree *ref_tree;
 	unsigned int ext_flags;
 	loff_t size;
 	u32 cpos, num_clusters, clusters, p_cluster;
@@ -2574,9 +2588,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 	}
 
 	BUG_ON(!di->i_refcount_loc);
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
-					le64_to_cpu(di->i_refcount_loc),
-					&ref_root_bh);
+	ret = ocfs2_lock_refcount_tree(osb,
+				       le64_to_cpu(di->i_refcount_loc), 1,
+				       &ref_tree, &ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2612,13 +2626,14 @@ static int ocfs2_attach_refcount_tree(struct inode
*inode,
 	 * record from the disk.
 	 */
 	ocfs2_extent_map_trunc(inode, 0);
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
-out:
 
 	if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
-		ocfs2_schedule_truncate_log_flush(OCFS2_SB(inode->i_sb), 1);
-		ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc);
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
 	}
+out:
 	return ret;
 }
 
@@ -2768,6 +2783,7 @@ static int ocfs2_create_reflink_node(struct inode
*s_inode,
 	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
 	struct ocfs2_refcount_block *rb;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree;
 	struct ocfs2_extent_list *el;
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -2779,9 +2795,8 @@ static int ocfs2_create_reflink_node(struct inode
*s_inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(t_inode),
-					le64_to_cpu(di->i_refcount_loc),
-					&ref_root_bh);
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2794,21 +2809,22 @@ static int ocfs2_create_reflink_node(struct inode
*s_inode,
 					  ref_root_bh, &dealloc);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_unlock_refcount;
 	}
 
 	ret = ocfs2_complete_reflink(s_inode, t_inode, t_bh);
 	if (ret)
 		mlog_errno(ret);
 
+out_unlock_refcount:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
 out:
 	if (ocfs2_dealloc_has_cluster(&dealloc)) {
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 		ocfs2_run_deallocs(osb, &dealloc);
 	}
 
-	brelse(ref_root_bh);
-
 	return ret;
 }
 
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 24/39] ocfs2: Add caching info for refcount tree.

refcount tree should use its own caching info so that when
we downconvert the refcount tree lock, we can drop all the
cached buffer head.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |  124 +++++++++++++++++++++++++++++++++++++++++-----
 fs/ocfs2/refcounttree.h |   12 +++++
 2 files changed, 122 insertions(+), 14 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 56c8c52..4ef6bd2 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -54,8 +54,11 @@ struct ocfs2_cow_context {
 	u32 cow_len;
 };
 
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops;
 static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb,
 				       struct ocfs2_refcount_tree *tree);
+static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
+				   struct ocfs2_refcount_tree **ret_tree);
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -142,6 +145,7 @@ int ocfs2_create_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_tree *tree;
 	u16 suballoc_bit_start;
 	u32 num_got;
 	u64 first_blkno;
@@ -179,10 +183,16 @@ int ocfs2_create_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 		goto out_commit;
 	}
 
+	ret = ocfs2_get_refcount_tree(osb, first_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
-	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+	ocfs2_set_new_buffer_uptodate(&tree->rf_ci, new_bh);
 
-	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), new_bh,
+	ret = ocfs2_journal_access_rb(handle, &tree->rf_ci, new_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
@@ -261,7 +271,7 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), ref_root_bh,
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -280,6 +290,7 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 	spin_unlock(&oi->ip_lock);
 	ocfs2_journal_dirty(handle, di_bh);
 
+	ocfs2_set_ci_lock_trans(osb->journal, &ref_tree->rf_ci);
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -352,6 +363,8 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 		goto out_unlock;
 	}
 
+	ocfs2_set_ci_lock_trans(osb->journal, &ref_tree->rf_ci);
+
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
@@ -359,7 +372,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct
buffer_head *di_bh)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), blk_bh,
+	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -1333,6 +1346,8 @@ static int __ocfs2_increase_refcount(handle_t *handle,
 	}
 
 out:
+	ocfs2_set_ci_lock_trans(
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal, ci);
 	brelse(ref_leaf_bh);
 	return ret;
 }
@@ -1518,6 +1533,7 @@ out:
 	return ret;
 }
 
+/* Caller must hold refcount tree lock. */
 int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
@@ -1528,11 +1544,19 @@ int ocfs2_decrease_refcount(struct inode *inode, struct
buffer_head *di_bh,
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *ref_root_bh = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_refcount_tree *tree;
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 	BUG_ON(!di->i_refcount_loc);
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+				      le64_to_cpu(di->i_refcount_loc), &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci,
 					le64_to_cpu(di->i_refcount_loc),
 					&ref_root_bh);
 	if (ret) {
@@ -1540,7 +1564,7 @@ int ocfs2_decrease_refcount(struct inode *inode, struct
buffer_head *di_bh,
 		goto out;
 	}
 
-	ret = __ocfs2_decrease_refcount(handle, INODE_CACHE(inode), ref_root_bh,
+	ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
 					cpos, len, meta_ac, dealloc, delete);
 	if (ret)
 		mlog_errno(ret);
@@ -1725,6 +1749,8 @@ out:
  * continguous also, so that we can get the number easily.
  * As for meta_ac, we will at most add split 2 refcount record and
  * 2 more refcount block, so just check it in a rough way.
+ *
+ * Caller must hold refcount tree lock.
  */
 int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  struct buffer_head *di_bh,
@@ -1737,6 +1763,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode
*inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_refcount_tree *tree;
 	u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
@@ -1750,7 +1777,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode
*inode,
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 	BUG_ON(!di->i_refcount_loc);
 
-	ret = ocfs2_read_refcount_block(INODE_CACHE(inode),
+	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
+				      le64_to_cpu(di->i_refcount_loc), &tree);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_refcount_block(&tree->rf_ci,
 					le64_to_cpu(di->i_refcount_loc),
 					&ref_root_bh);
 	if (ret) {
@@ -1759,7 +1793,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode
*inode,
 	}
 
 	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
-					       INODE_CACHE(inode),
+					       &tree->rf_ci,
 					       ref_root_bh,
 					       start_cpos, clusters,
 					       &ref_blocks, credits);
@@ -2187,6 +2221,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
 		mlog_errno(ret);
 
 out:
+	ocfs2_set_ci_lock_trans(
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci))->journal,
+		et->et_ci);
 	ocfs2_free_path(path);
 	return ret;
 }
@@ -2348,6 +2385,7 @@ out:
 static int ocfs2_replace_cow(struct inode *inode,
 			     struct buffer_head *di_bh,
 			     struct buffer_head *ref_root_bh,
+			     struct ocfs2_caching_info *ref_ci,
 			     u32 cow_start, u32 cow_len,
 			     struct page **pages,
 			     int num_pages)
@@ -2372,7 +2410,7 @@ static int ocfs2_replace_cow(struct inode *inode,
 	context.num_pages = num_pages;
 	context.cow_start = cow_start;
 	context.cow_len = cow_len;
-	context.ref_ci = INODE_CACHE(inode);
+	context.ref_ci = ref_ci;
 	context.ref_root_bh = ref_root_bh;
 
 	context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1),
@@ -2480,7 +2518,7 @@ int ocfs2_refcount_cow(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh,
+	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
 				cow_start, cow_len, pages, num_pages);
 	if (ret)
 		mlog_errno(ret);
@@ -2611,7 +2649,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 			continue;
 
 		ret = ocfs2_add_refcount_flag(inode, &di_et,
-					      INODE_CACHE(inode), ref_root_bh,
+					      &ref_tree->rf_ci, ref_root_bh,
 					      cpos - num_clusters,
 					      p_cluster, num_clusters,
 					      &dealloc);
@@ -2694,6 +2732,7 @@ out:
 static int ocfs2_duplicate_extent_list(struct inode *s_inode,
 				struct inode *t_inode,
 				struct buffer_head *t_bh,
+				struct ocfs2_caching_info *ref_ci,
 				struct buffer_head *ref_root_bh,
 				struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
@@ -2715,8 +2754,7 @@ static int ocfs2_duplicate_extent_list(struct inode
*s_inode,
 
 		if (p_cluster) {
 			ret = ocfs2_add_refcounted_extent(t_inode, &et,
-							  INODE_CACHE(t_inode),
-							  ref_root_bh,
+							  ref_ci, ref_root_bh,
 							  cpos, p_cluster,
 							  num_clusters,
 							  ext_flags,
@@ -2806,7 +2844,8 @@ static int ocfs2_create_reflink_node(struct inode
*s_inode,
 	el = &di->id2.i_list;
 
 	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
-					  ref_root_bh, &dealloc);
+					  &ref_tree->rf_ci, ref_root_bh,
+					  &dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_unlock_refcount;
@@ -3076,6 +3115,7 @@ static void ocfs2_insert_refcount_tree(struct ocfs2_super
*osb,
 static void ocfs2_free_refcount_tree(struct ocfs2_super *osb,
 				     struct ocfs2_refcount_tree *tree)
 {
+	ocfs2_metadata_cache_exit(&tree->rf_ci);
 	ocfs2_simple_drop_lockres(osb, &tree->rf_lockres);
 	ocfs2_lock_res_free(&tree->rf_lockres);
 	kfree(tree);
@@ -3113,8 +3153,12 @@ static int ocfs2_get_refcount_tree(struct ocfs2_super
*osb, u64 rf_blkno,
 	}
 
 	new->rf_blkno = rf_blkno;
+	new->rf_sb = osb->sb;
+	spin_lock_init(&new->rf_lock);
+	mutex_init(&new->rf_io_mutex);
 	init_rwsem(&new->rf_sem);
 	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, rf_blkno);
+	ocfs2_metadata_cache_init(&new->rf_ci,
&ocfs2_refcount_caching_ops);
 
 	spin_lock(&osb->osb_lock);
 	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
@@ -3208,3 +3252,55 @@ void ocfs2_purge_refcount_tree(struct ocfs2_super *osb)
 		ocfs2_free_refcount_tree(osb, tree);
 	}
 }
+
+static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_blkno;
+}
+
+static struct super_block *
+ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	return rf->rf_sb;
+}
+
+static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_lock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	spin_unlock(&rf->rf_lock);
+}
+
+static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_lock(&rf->rf_io_mutex);
+}
+
+static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
+
+	mutex_unlock(&rf->rf_io_mutex);
+}
+
+static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
+	.co_owner		= ocfs2_refcount_cache_owner,
+	.co_get_super		= ocfs2_refcount_cache_get_super,
+	.co_cache_lock		= ocfs2_refcount_cache_lock,
+	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
+	.co_io_lock		= ocfs2_refcount_cache_io_lock,
+	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
+};
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 5590127..b7fb077 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -22,7 +22,13 @@ struct ocfs2_refcount_tree {
 	u64 rf_blkno;
 	struct rw_semaphore rf_sem;
 	struct ocfs2_lock_res rf_lockres;
+
+	/* the following 4 fields are used by caching_info. */
 	struct ocfs2_caching_info rf_ci;
+	spinlock_t rf_lock;
+	struct mutex rf_io_mutex;
+	struct super_block *rf_sb;
+
 };
 
 static inline struct ocfs2_refcount_tree *
@@ -31,6 +37,12 @@ OCFS2_REF_ITEM(struct ocfs2_lock_res *res)
 	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
 }
 
+static inline struct ocfs2_refcount_tree *
+cache_info_to_refcount(struct ocfs2_caching_info *ci)
+{
+	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
+}
+
 int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
 int ocfs2_set_refcount_tree(struct inode *inode,
 			    struct buffer_head *di_bh,
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 25/39] ocfs2: Add refcount tree find mechanism from an inode.

xattr need refcount tree support, but some functions don't
have an easy access to the inode bh, so add a helper function
which read inode_bh first and then get the refcount tree.

And in order to speed up the refcount searching, we add a
LRU to the ocfs2_super so that we can find a recently used
refcount tree more quickly.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/ocfs2.h        |    1 +
 fs/ocfs2/refcounttree.c |   85 ++++++++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/refcounttree.h |    1 -
 3 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3f62291..9ea1b30 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -383,6 +383,7 @@ struct ocfs2_super
 
 	/* rb tree root for refcount lock. */
 	struct rb_root	osb_rf_lock_tree;
+	struct ocfs2_refcount_tree *osb_ref_tree_lru;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4ef6bd2..a12d739 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3140,7 +3140,11 @@ static int ocfs2_get_refcount_tree(struct ocfs2_super
*osb, u64 rf_blkno,
 	struct ocfs2_refcount_tree *tree, *new = NULL;
 
 	spin_lock(&osb->osb_lock);
-	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
+	if (osb->osb_ref_tree_lru &&
+	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
+		tree = osb->osb_ref_tree_lru;
+	else
+		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
 	if (tree)
 		goto out;
 
@@ -3176,29 +3180,48 @@ out:
 
 	*ret_tree = tree;
 
+	osb->osb_ref_tree_lru = tree;
 	spin_unlock(&osb->osb_lock);
 
 	return ret;
 }
 
-/*
- * Lock the refcount tree pointed by ref_blkno and return the tree.
- * In most case, we lock the tree and read the refcount block.
- * So read it here if the caller really need it.
- */
-int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
-			     struct ocfs2_refcount_tree **ret_tree,
-			     struct buffer_head **ref_bh)
+/* Search refcount tree by inode. */
+static int ocfs2_get_refcount_tree_by_inode(struct inode *inode,
+					struct ocfs2_refcount_tree **ret_tree)
 {
 	int ret;
-	struct ocfs2_refcount_tree *tree = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	/* We have never touch refcount tree for this inode. So let us do it. */
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	ret = ocfs2_get_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				      ret_tree);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
+				      struct ocfs2_refcount_tree *tree, int rw,
+				      struct ocfs2_refcount_tree **ret_tree,
+				      struct buffer_head **ref_bh)
+{
+	int ret;
+
 	ret = ocfs2_refcount_lock(&tree->rf_lockres, rw);
 	if (ret) {
 		mlog_errno(ret);
@@ -3212,7 +3235,7 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64
ref_blkno, int rw,
 
 	if (ref_bh) {
 		ret = ocfs2_read_refcount_block(&tree->rf_ci,
-						ref_blkno, ref_bh);
+						tree->rf_blkno, ref_bh);
 		if (ret) {
 			mlog_errno(ret);
 			ocfs2_unlock_refcount_tree(osb, tree, rw);
@@ -3225,6 +3248,44 @@ out:
 	return ret;
 }
 
+/*
+ * Lock the refcount tree pointed by ref_blkno and return the tree.
+ * In most case, we lock the tree and read the refcount block.
+ * So read it here if the caller really need it.
+ */
+int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
+			     struct ocfs2_refcount_tree **ret_tree,
+			     struct buffer_head **ref_bh)
+{
+	int ret;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return __ocfs2_lock_refcount_tree(osb, tree, rw, ret_tree, ref_bh);
+}
+
+int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
+				      struct ocfs2_refcount_tree **ret_tree,
+				      struct buffer_head **ref_bh)
+{
+	int ret;
+	struct ocfs2_refcount_tree *tree = NULL;
+
+	ret = ocfs2_get_refcount_tree_by_inode(inode, &tree);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return __ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), tree,
+					  rw, ret_tree, ref_bh);
+}
+
 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
 				struct ocfs2_refcount_tree *tree, int rw)
 {
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index b7fb077..ed86f65 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -28,7 +28,6 @@ struct ocfs2_refcount_tree {
 	spinlock_t rf_lock;
 	struct mutex rf_io_mutex;
 	struct super_block *rf_sb;
-
 };
 
 static inline struct ocfs2_refcount_tree *
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 26/39] ocfs2: Return extent flags for xattr value tree.

With the new refcount tree, xattr value can also be refcounted
among multiple files. So return the appropriate extent flags
so that CoW can used it later.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/extent_map.c |    6 +++++-
 fs/ocfs2/extent_map.h |    3 ++-
 fs/ocfs2/xattr.c      |    7 ++++---
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 40b5105..843db64 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -541,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block
*sb,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el)
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags)
 {
 	int ret = 0, i;
 	struct buffer_head *eb_bh = NULL;
@@ -593,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32
v_cluster,
 		*p_cluster = *p_cluster + coff;
 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+		if (extent_flags)
+			*extent_flags = rec->e_flags;
 	}
 out:
 	if (eb_bh)
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 9942f47..e79d41c 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,7 +55,8 @@ int ocfs2_fiemap(struct inode *inode, struct
fiemap_extent_info *fieinfo,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el);
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags);
 
 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 			   struct buffer_head *bhs[], int flags,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dbc1e08..96c4d49 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -704,7 +704,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
 					       &alloc_size,
-					       &vb->vb_xv->xr_list);
+					       &vb->vb_xv->xr_list, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -959,7 +959,7 @@ static int ocfs2_xattr_get_value_outside(struct inode
*inode,
 	cpos = 0;
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
-					       &num_clusters, el);
+					       &num_clusters, el, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1197,7 +1197,8 @@ static int __ocfs2_xattr_set_value_outside(struct inode
*inode,
 
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
-					       &num_clusters, &xv->xr_list);
+					       &num_clusters, &xv->xr_list,
+					       NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 27/39] ocfs2: Abstract duplicate clusters process in CoW.

We currently use pagecache to duplicate clusters in CoW,
but it isn't suitable for xattr case. So abstract it out
so that the caller can decide which method it use.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |  141 ++++++++++++++++++++++++++---------------------
 1 files changed, 79 insertions(+), 62 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a12d739..0f94619 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -41,7 +41,7 @@
 
 struct ocfs2_cow_context {
 	struct inode *inode;
-	struct ocfs2_extent_tree di_et;
+	struct ocfs2_extent_tree data_et;
 	struct ocfs2_caching_info *ref_ci;
 	struct buffer_head *ref_root_bh;
 	struct ocfs2_alloc_context *meta_ac;
@@ -52,6 +52,14 @@ struct ocfs2_cow_context {
 	int num_pages;
 	u32 cow_start;
 	u32 cow_len;
+	int (*get_clusters)(struct ocfs2_cow_context *context,
+			    u32 v_cluster, u32 *p_cluster,
+			    u32 *num_clusters,
+			    unsigned int *extent_flags);
+	int (*cow_duplicate_clusters)(handle_t *handle,
+				      struct ocfs2_cow_context *context,
+				      u32 cpos, u32 old_cluster,
+				      u32 new_cluster, u32 new_len);
 };
 
 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops;
@@ -1830,7 +1838,7 @@ out:
  * use that value as the maximum clusters.
  */
 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
-					   struct buffer_head *di_bh,
+					   struct ocfs2_extent_list *el,
 					   u32 cpos,
 					   u32 write_len,
 					   u32 *cow_start,
@@ -1838,8 +1846,6 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode
*inode,
 					   int *has_data)
 {
 	int ret = 0;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
 	int tree_height = le16_to_cpu(el->l_tree_depth), i;
 	struct buffer_head *eb_bh = NULL;
 	struct ocfs2_extent_block *eb = NULL;
@@ -2091,13 +2097,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_duplicate_clusters(handle_t *handle,
-				    struct ocfs2_cow_context *context,
-				    u32 cpos, u32 old_cluster,
-				    u32 new_cluster, u32 new_len)
+static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
+					    struct ocfs2_cow_context *context,
+					    u32 cpos, u32 old_cluster,
+					    u32 new_cluster, u32 new_len)
 {
 	int ret = 0, bh_num;
-	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	int i, j, bpc = ocfs2_clusters_to_blocks(sb, 1);
 	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
@@ -2235,7 +2241,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
 				  unsigned int ext_flags)
 {
 	int ret;
-	struct ocfs2_caching_info *ci = context->di_et.et_ci;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	u64 ino = ocfs2_metadata_cache_owner(ci);
 
 	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags
%u\n",
@@ -2243,15 +2249,15 @@ static int ocfs2_replace_clusters(handle_t *handle,
 
 	/*If the old clusters is unwritten, no need to duplicate. */
 	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
-		ret = ocfs2_duplicate_clusters(handle, context, cpos,
-					       old, new, len);
+		ret = context->cow_duplicate_clusters(handle, context, cpos,
+						      old, new, len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+	ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
 				       cpos, new, len, ext_flags,
 				       context->meta_ac, &context->dealloc);
 	if (ret)
@@ -2277,7 +2283,7 @@ static int ocfs2_make_clusters_writable(struct super_block
*sb,
 	     cpos, p_cluster, num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
-					     &context->di_et,
+					     &context->data_et,
 					     context->ref_ci,
 					     context->ref_root_bh,
 					     &context->meta_ac,
@@ -2318,7 +2324,8 @@ static int ocfs2_make_clusters_writable(struct super_block
*sb,
 		 */
 		if (le32_to_cpu(rec.r_refcount) == 1) {
 			delete = 0;
-			ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
+			ret = ocfs2_clear_ext_refcount(handle,
+						       &context->data_et,
 						       cpos, p_cluster,
 						       set_len, e_flags,
 						       context->meta_ac,
@@ -2382,19 +2389,25 @@ out:
 	return ret;
 }
 
-static int ocfs2_replace_cow(struct inode *inode,
-			     struct buffer_head *di_bh,
-			     struct buffer_head *ref_root_bh,
-			     struct ocfs2_caching_info *ref_ci,
-			     u32 cow_start, u32 cow_len,
-			     struct page **pages,
-			     int num_pages)
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+				 u32 v_cluster, u32 *p_cluster,
+				 u32 *num_clusters,
+				 unsigned int *extent_flags)
+{
+	struct inode *inode = context->inode;
+
+	return ocfs2_get_clusters(inode, v_cluster, p_cluster,
+				  num_clusters, extent_flags);
+}
+
+static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
 {
 	int ret;
-	u32 p_cluster, num_clusters, start = cow_start;
+	struct inode *inode = context->inode;
+	u32 cow_start = context->cow_start, cow_len = context->cow_len;
+	u32 p_cluster, num_clusters;
 	unsigned int ext_flags;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_cow_context context;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2403,38 +2416,18 @@ static int ocfs2_replace_cow(struct inode *inode,
 		return -EROFS;
 	}
 
-	memset(&context, 0, sizeof(context));
-
-	context.inode = inode;
-	context.cow_pages = pages;
-	context.num_pages = num_pages;
-	context.cow_start = cow_start;
-	context.cow_len = cow_len;
-	context.ref_ci = ref_ci;
-	context.ref_root_bh = ref_root_bh;
-
-	context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1),
-			      sizeof(struct buffer_head *), GFP_NOFS);
-	if (!context.bhs) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	ocfs2_init_dealloc_ctxt(&context.dealloc);
-	ocfs2_init_dinode_extent_tree(&context.di_et,
-				      INODE_CACHE(inode), di_bh);
+	ocfs2_init_dealloc_ctxt(&context->dealloc);
 
 	while (cow_len) {
-		ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
-					 &num_clusters, &ext_flags);
+		ret = context->get_clusters(context, cow_start, &p_cluster,
+					    &num_clusters, &ext_flags);
 
 		BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
 
 		if (cow_len < num_clusters)
 			num_clusters = cow_len;
 
-		ret = ocfs2_make_clusters_writable(inode->i_sb, &context,
+		ret = ocfs2_make_clusters_writable(inode->i_sb, context,
 						   cow_start, p_cluster,
 						   num_clusters, ext_flags);
 		if (ret) {
@@ -2446,20 +2439,11 @@ static int ocfs2_replace_cow(struct inode *inode,
 		cow_start += num_clusters;
 	}
 
-
-	/*
-	 * truncate the extent map here since no matter whether we meet with
-	 * any error during the action, we shouldn't trust cached extent map
-	 * any more.
-	 */
-	ocfs2_extent_map_trunc(inode, start);
-
-	if (ocfs2_dealloc_has_cluster(&context.dealloc)) {
+	if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
 		ocfs2_schedule_truncate_log_flush(osb, 1);
-		ocfs2_run_deallocs(osb, &context.dealloc);
+		ocfs2_run_deallocs(osb, &context->dealloc);
 	}
 
-	kfree(context.bhs);
 	return ret;
 }
 
@@ -2476,11 +2460,15 @@ int ocfs2_refcount_cow(struct inode *inode,
 	struct page **pages = NULL;
 	struct ocfs2_refcount_tree *ref_tree;
 	loff_t start, end;
+	struct ocfs2_cow_context context;
+
+	memset(&context, 0, sizeof(context));
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 	BUG_ON(!di->i_refcount_loc);
 
-	ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+					      cpos, write_len,
 					      &cow_start, &cow_len, &has_data);
 	if (ret) {
 		mlog_errno(ret);
@@ -2518,11 +2506,40 @@ int ocfs2_refcount_cow(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
-				cow_start, cow_len, pages, num_pages);
+	context.inode = inode;
+	context.cow_start = cow_start;
+	context.cow_len = cow_len;
+	context.ref_ci = &ref_tree->rf_ci;
+	context.ref_root_bh = ref_root_bh;
+	context.cow_pages = pages;
+	context.num_pages = num_pages;
+	context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1),
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!context.bhs) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	context.cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+	context.get_clusters = ocfs2_di_get_clusters;
+
+	ocfs2_init_dinode_extent_tree(&context.data_et,
+				      INODE_CACHE(inode), di_bh);
+
+	ret = ocfs2_replace_cow(&context);
 	if (ret)
 		mlog_errno(ret);
 
+	/*
+	 * truncate the extent map here since no matter whether we meet with
+	 * any error during the action, we shouldn't trust cached extent map
+	 * any more.
+	 */
+	ocfs2_extent_map_trunc(inode, cow_start);
+
+	kfree(context.bhs);
+out_unlock:
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 out:
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 28/39] ocfs2: Add CoW support for xattr.

In order to make 2 transcation(xattr and cow) independent with each other,
we CoW the whole xattr out in case we are setting them.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |    2 +-
 fs/ocfs2/refcounttree.c |  250 +++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/refcounttree.h |   31 ++++++-
 fs/ocfs2/xattr.c        |  234 +++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 496 insertions(+), 21 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 013e600..2646f0c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6843,7 +6843,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 
 	if (delete_blk) {
 		if (rec_flags & OCFS2_EXT_REFCOUNTED)
-			status = ocfs2_decrease_refcount(inode, fe_bh, handle,
+			status = ocfs2_decrease_refcount(inode, handle,
 					ocfs2_blocks_to_clusters(osb->sb,
 								 delete_blk),
 					clusters_to_del, meta_ac,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 0f94619..96948b0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -33,6 +33,7 @@
 #include "extent_map.h"
 #include "aops.h"
 #include "namei.h"
+#include "xattr.h"
 
 #include <linux/security.h>
 #include <linux/quotaops.h>
@@ -50,6 +51,9 @@ struct ocfs2_cow_context {
 	struct buffer_head **bhs;
 	struct page **cow_pages;
 	int num_pages;
+	struct ocfs2_xattr_value_root *cow_xv;
+	struct ocfs2_post_refcount *post_refcount;
+	int extra_credits;
 	u32 cow_start;
 	u32 cow_len;
 	int (*get_clusters)(struct ocfs2_cow_context *context,
@@ -67,6 +71,8 @@ static void ocfs2_delete_refcount_tree(struct ocfs2_super
*osb,
 				       struct ocfs2_refcount_tree *tree);
 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
 				   struct ocfs2_refcount_tree **ret_tree);
+static int ocfs2_get_refcount_tree_by_inode(struct inode *inode,
+					struct ocfs2_refcount_tree **ret_tree);
 
 static int ocfs2_validate_refcount_block(struct super_block *sb,
 					 struct buffer_head *bh)
@@ -1542,7 +1548,7 @@ out:
 }
 
 /* Caller must hold refcount tree lock. */
-int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -1551,21 +1557,17 @@ int ocfs2_decrease_refcount(struct inode *inode, struct
buffer_head *di_bh,
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *ref_root_bh = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_refcount_tree *tree;
 
 	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
-	BUG_ON(!di->i_refcount_loc);
 
-	ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
-				      le64_to_cpu(di->i_refcount_loc), &tree);
+	ret = ocfs2_get_refcount_tree_by_inode(inode, &tree);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_read_refcount_block(&tree->rf_ci,
-					le64_to_cpu(di->i_refcount_loc),
+	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
 					&ref_root_bh);
 	if (ret) {
 		mlog_errno(ret);
@@ -2177,6 +2179,65 @@ out:
 	return ret;
 }
 
+static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+					   struct ocfs2_cow_context *context,
+					   u32 cpos, u32 old_cluster,
+					   u32 new_cluster, u32 new_len)
+{
+	int ret = 0;
+	struct super_block *sb = context->inode->i_sb;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *old_bh = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
+	     new_cluster, new_len);
+
+	for (i = 0; i < blocks; i++, old_block++, new_block++) {
+		new_bh = sb_getblk(osb->sb, new_block);
+		if (new_bh == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+		ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_journal_access(handle, ci, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		brelse(new_bh);
+		brelse(old_bh);
+		new_bh = NULL;
+		old_bh = NULL;
+	}
+
+	brelse(new_bh);
+	brelse(old_bh);
+	return ret;
+}
+
 static int ocfs2_clear_ext_refcount(handle_t *handle,
 				    struct ocfs2_extent_tree *et,
 				    u32 cpos, u32 p_cluster, u32 len,
@@ -2293,6 +2354,10 @@ static int ocfs2_make_clusters_writable(struct
super_block *sb,
 		return ret;
 	}
 
+	if (context->post_refcount)
+		credits += context->post_refcount->credits;
+
+	credits += context->extra_credits;
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -2373,6 +2438,14 @@ static int ocfs2_make_clusters_writable(struct
super_block *sb,
 		ref_leaf_bh = NULL;
 	}
 
+	/* handle some pos_cow case. */
+	if (context->post_refcount && context->post_refcount->func) {
+		ret = context->post_refcount->func(context->inode, handle,
+						context->post_refcount->para);
+		if (ret)
+			mlog_errno(ret);
+	}
+
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -2550,6 +2623,169 @@ out:
 	return ret;
 }
 
+static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
+					  u32 v_cluster, u32 *p_cluster,
+					  u32 *num_clusters,
+					  unsigned int *extent_flags)
+{
+	struct inode *inode = context->inode;
+	struct ocfs2_xattr_value_root *xv = context->cow_xv;
+
+	return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
+					num_clusters, &xv->xr_list,
+					extent_flags);
+}
+
+/*
+ * Given a xattr value root, calculate the most meta/credits we need for
+ * refcount tree change if we truncate it to 0.
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits)
+{
+	int ret = 0, index, ref_blocks = 0;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += num_clusters;
+
+		while (num_clusters) {
+			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+						     p_cluster, num_clusters,
+						     &rec, &index,
+						     &ref_leaf_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!rec.r_refcount);
+
+			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+			/*
+			 * We really don't know whether the other clusters is in
+			 * this refcount block or not, so just take the worst
+			 * case that all the clusters are in this block and each
+			 * one will split a refcount rec, so totally we need
+			 * clusters * 2 new refcount rec.
+			 */
+			if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+			    le16_to_cpu(rb->rf_records.rl_count))
+				ref_blocks++;
+
+			*credits += 1;
+			brelse(ref_leaf_bh);
+			ref_leaf_bh = NULL;
+
+			if (num_clusters <= le32_to_cpu(rec.r_clusters))
+				break;
+			else
+				num_clusters -= le32_to_cpu(rec.r_clusters);
+			p_cluster += num_clusters;
+		}
+	}
+
+	*meta_add += ref_blocks;
+	if (!ref_blocks)
+		goto out;
+
+	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
+	else {
+		struct ocfs2_extent_tree et;
+
+		ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
+		*credits += ocfs2_calc_extend_credits(inode->i_sb,
+						      et.et_root_el,
+						      ref_blocks);
+	}
+
+out:
+	brelse(ref_leaf_bh);
+	return ret;
+}
+
+/*
+ * Do CoW for xattr.
+ */
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_caching_info *ref_ci,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post)
+{
+	int ret, has_data;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_cow_context context;
+	u32 cow_start, cow_len;
+
+	memset(&context, 0, sizeof(context));
+
+	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
+					      cpos, write_len,
+					      &cow_start, &cow_len, &has_data);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	BUG_ON(cow_len == 0 || has_data == 0);
+
+	context.inode = inode;
+	context.cow_start = cow_start;
+	context.cow_len = cow_len;
+	context.ref_ci = ref_ci;
+	context.ref_root_bh = ref_root_bh;;
+	context.cow_xv = xv;
+	context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1),
+			      sizeof(struct buffer_head *), GFP_NOFS);
+	if (!context.bhs) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	context.cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
+	/* We need the extra credits for duplicate_clusters by jbd. */
+	context.extra_credits +		ocfs2_clusters_to_blocks(inode->i_sb, 1) *
cow_len;
+	context.get_clusters = ocfs2_xattr_value_get_clusters;
+	context.post_refcount = post;
+
+	ocfs2_init_xattr_value_extent_tree(&context.data_et,
+					   INODE_CACHE(inode), vb);
+
+	ret = ocfs2_replace_cow(&context);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	kfree(context.bhs);
+	return ret;
+}
+
 /*
  * Insert a new extent into refcount tree and mark a extent rec
  * as refcounted in the dinode tree.
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index ed86f65..1dda219 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -48,7 +48,7 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 			    u64 blkno);
 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
 
-int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
 			    struct ocfs2_alloc_context *meta_ac,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -71,4 +71,33 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64
ref_blkno, int rw,
 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
 				struct ocfs2_refcount_tree *tree,
 				int rw);
+
+typedef int (ocfs2_post_refcount_func)(struct inode *inode,
+				       handle_t *handle,
+				       void *para);
+/*
+ * Some refcount caller need to do more work after we modify the data b-tree
+ * during refcount operation(including CoW and add refcount flag), and make the
+ * transaction complete. So it must give us this structure so that we can do it
+ * within our transaction.
+ *
+ */
+struct ocfs2_post_refcount {
+	int credits;			/* credits it need for journal. */
+	ocfs2_post_refcount_func *func;	/* real function. */
+	void *para;
+};
+
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits);
+int ocfs2_refcount_cow_xattr(struct inode *inode,
+			     struct ocfs2_dinode *di,
+			     struct ocfs2_xattr_value_buf *vb,
+			     struct ocfs2_caching_info *ref_ci,
+			     struct buffer_head *ref_root_bh,
+			     u32 cpos, u32 write_len,
+			     struct ocfs2_post_refcount *post);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 96c4d49..b41598f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,7 @@
 #include "buffer_head_io.h"
 #include "super.h"
 #include "xattr.h"
-
+#include "refcounttree.h"
 
 struct ocfs2_xattr_def_value_root {
 	struct ocfs2_xattr_value_root	xv;
@@ -176,6 +176,14 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode,
handle_t *handle,
 				  u64 src_blk, u64 last_blk, u64 to_blk,
 				  unsigned int start_bucket,
 				  u32 *first_hash);
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_need,
+					int *credits);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -647,6 +655,7 @@ leave:
 static int __ocfs2_remove_xattr_range(struct inode *inode,
 				      struct ocfs2_xattr_value_buf *vb,
 				      u32 cpos, u32 phys_cpos, u32 len,
+				      unsigned int ext_flags,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
@@ -678,7 +687,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
+	if (ext_flags & OCFS2_EXT_REFCOUNTED)
+		ret = ocfs2_decrease_refcount(inode, handle,
+					ocfs2_blocks_to_clusters(inode->i_sb,
+								 phys_blkno),
+					len, ctxt->meta_ac, &ctxt->dealloc, 1);
+	else
+		ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
+						  phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
@@ -693,6 +709,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret = 0;
+	unsigned int ext_flags;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
 
@@ -704,7 +721,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
 					       &alloc_size,
-					       &vb->vb_xv->xr_list, NULL);
+					       &vb->vb_xv->xr_list, &ext_flags);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -715,7 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 
 		ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
 						 phys_cpos, alloc_size,
-						 ctxt);
+						 ext_flags, ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1181,7 +1198,7 @@ static int ocfs2_xattr_get(struct inode *inode,
 
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 					   handle_t *handle,
-					   struct ocfs2_xattr_value_root *xv,
+					   struct ocfs2_xattr_value_buf *vb,
 					   const void *value,
 					   int value_len)
 {
@@ -1192,18 +1209,22 @@ static int __ocfs2_xattr_set_value_outside(struct inode
*inode,
 	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
 	u64 blkno;
 	struct buffer_head *bh = NULL;
+	unsigned int ext_flags;
+	struct ocfs2_xattr_value_root *xv = vb->vb_xv;
 
 	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
 
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
 					       &num_clusters, &xv->xr_list,
-					       NULL);
+					       &ext_flags);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
+		BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
@@ -1355,7 +1376,7 @@ static int ocfs2_xattr_set_value_outside(struct inode
*inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
 					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
@@ -1594,7 +1615,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 				ret = __ocfs2_xattr_set_value_outside(inode,
 								handle,
-								vb.vb_xv,
+								&vb,
 								xi->value,
 								xi->value_len);
 				if (ret < 0)
@@ -2430,6 +2451,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 				     struct ocfs2_xattr_search *xis,
 				     struct ocfs2_xattr_search *xbs,
 				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int extra_meta,
 				     int *credits)
 {
 	int clusters_add, meta_add, ret;
@@ -2446,6 +2468,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 		return ret;
 	}
 
+	meta_add += extra_meta;
 	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
 	     "credits = %d\n", xi->name, meta_add, clusters_add,
*credits);
 
@@ -2713,10 +2736,11 @@ int ocfs2_xattr_set(struct inode *inode,
 {
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	int ret, credits;
+	int ret, credits, ref_meta = 0, ref_credits = 0;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -2781,6 +2805,17 @@ int ocfs2_xattr_set(struct inode *inode,
 			goto cleanup;
 	}
 
+	/* Check whether the value is refcounted and do some prepartion. */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
+	    (!xis.not_found || !xbs.not_found)) {
+		ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
+						   &xis, &xbs, &ref_tree,
+						   &ref_meta, &ref_credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto cleanup;
+		}
+	}
 
 	mutex_lock(&tl_inode->i_mutex);
 
@@ -2795,7 +2830,7 @@ int ocfs2_xattr_set(struct inode *inode,
 	mutex_unlock(&tl_inode->i_mutex);
 
 	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
-					&xbs, &ctxt, &credits);
+					&xbs, &ctxt, ref_meta, &credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto cleanup;
@@ -2803,7 +2838,7 @@ int ocfs2_xattr_set(struct inode *inode,
 
 	/* we need to update inode's ctime field, so add credit for it. */
 	credits += OCFS2_INODE_UPDATE_CREDITS;
-	ctxt.handle = ocfs2_start_trans(osb, credits);
+	ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
@@ -2822,6 +2857,8 @@ int ocfs2_xattr_set(struct inode *inode,
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	ocfs2_inode_unlock(inode, 1);
 cleanup_nolock:
@@ -4800,6 +4837,9 @@ static int ocfs2_xattr_bucket_set_value_outside(struct
inode *inode,
 	struct ocfs2_xattr_entry *xe = xs->here;
 	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
 	void *base;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
@@ -4816,8 +4856,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct
inode *inode,
 	xv = (struct ocfs2_xattr_value_root *)(base + offset +
 		 OCFS2_XATTR_SIZE(xe->xe_name_len));
 
+	vb.vb_xv = xv;
+	vb.vb_bh = xs->bucket->bu_bhs[block_off];
 	ret = __ocfs2_xattr_set_value_outside(inode, handle,
-					      xv, val, value_len);
+					      &vb, val, value_len);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -5309,6 +5351,174 @@ out:
 }
 
 /*
+ * Whenever we modify a xattr value root in the bucket(e.g, CoW
+ * or change the extent record flag), we need to recalculate
+ * the metaecc for the whole bucket. So it is done here.
+ *
+ * Note:
+ * We have to give the extra credits for the caller.
+ */
+static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
+					    handle_t *handle,
+					    void *para)
+{
+	int ret;
+	struct ocfs2_xattr_bucket *bucket +			(struct ocfs2_xattr_bucket *)para;
+
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+
+	return 0;
+}
+
+/*
+ * Special action we need if the xattr value is refcounted.
+ *
+ * 1. If the xattr is refcounted, lock the tree.
+ * 2. CoW the xattr if we are setting the new value and the value
+ *    will be stored outside.
+ * 3. In other case, decrease_refcount will work for us, so just
+ *    lock the refcount tree, calculate the meta and credits is OK.
+ *
+ * We have to do CoW before ocfs2_init_xattr_set_ctxt since
+ * currently CoW is a completed transaction, while this function
+ * will also lock the allocators and let us deadlock. So we will
+ * CoW the whole xattr value.
+ */
+static int ocfs2_prepare_refcount_xattr(struct inode *inode,
+					struct ocfs2_dinode *di,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xis,
+					struct ocfs2_xattr_search *xbs,
+					struct ocfs2_refcount_tree **ref_tree,
+					int *meta_add,
+					int *credits)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_entry *xe;
+	char *base;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+	int name_offset, name_len;
+	struct ocfs2_xattr_value_buf vb;
+	struct ocfs2_xattr_bucket *bucket = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_post_refcount refcount;
+	struct ocfs2_post_refcount *p = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		vb.vb_bh = xis->inode_bh;
+		vb.vb_access = ocfs2_journal_access_di;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			base = bucket_block(xbs->bucket, block_off);
+			vb.vb_bh = xbs->bucket->bu_bhs[block_off];
+			vb.vb_access = ocfs2_journal_access;
+
+			if (ocfs2_meta_ecc(osb)) {
+				/*create parameters for ocfs2_post_refcount. */
+				bucket = xbs->bucket;
+				refcount.credits = bucket->bu_blocks;
+				refcount.para = bucket;
+				refcount.func +					ocfs2_xattr_bucket_post_refcount;
+				p = &refcount;
+			}
+		} else {
+			base = xbs->base;
+			vb.vb_bh = xbs->xattr_bh;
+			vb.vb_access = ocfs2_journal_access_xb;
+		}
+	}
+
+	if (ocfs2_xattr_is_local(xe))
+		goto out;
+
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+				(base + name_offset + name_len);
+
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters, &vb.vb_xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We just need to check the 1st extent record, since we always
+	 * CoW the whole xattr. So there shouldn't be a xattr with
+	 * some REFCOUNT extent recs after the 1st one.
+	 */
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * If we are deleting the xattr or the new size will be stored inside,
+	 * cool, leave it there, the xattr truncate process will remove them
+	 * for us(it still needs the refcount tree lock and the meta, credits).
+	 * And the worse case is that every cluster truncate will split the
+	 * refcount tree, and make the original extent become 3. So we will need
+	 * 2 * cluster more extent recs at most.
+	 */
+	if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
+
+		ret = ocfs2_refcounted_xattr_delete_need(inode,
+							 &(*ref_tree)->rf_ci,
+							 ref_root_bh, vb.vb_xv,
+							 meta_add, credits);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
+				       &(*ref_tree)->rf_ci, ref_root_bh, 0,
+				       le32_to_cpu(vb.vb_xv->xr_clusters), p);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(ref_root_bh);
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 29/39] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value.

In ocfs2_xattr_bucket_get_name_value, actually we only use
super_block. So use it.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index b41598f..82f260b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -140,7 +140,7 @@ struct ocfs2_xattr_search {
 	int not_found;
 };
 
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
 					     int *block_off,
@@ -1100,7 +1100,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		i = xs->here - xs->header->xh_entries;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 								bucket_xh(xs->bucket),
 								i,
 								&block_off,
@@ -2296,7 +2296,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		old_in_xb = 1;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							bucket_xh(xbs->bucket),
 							i, &block_off,
 							&name_offset);
@@ -2971,7 +2971,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 		if (cmp)
 			continue;
 
-		ret = ocfs2_xattr_bucket_get_name_value(inode,
+		ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							xh,
 							i,
 							&block_off,
@@ -3215,7 +3215,7 @@ struct ocfs2_xattr_tree_list {
 	size_t result;
 };
 
-static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
 					     int *block_off,
@@ -3228,8 +3228,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode
*inode,
 
 	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
 
-	*block_off = name_offset >> inode->i_sb->s_blocksize_bits;
-	*new_offset = name_offset % inode->i_sb->s_blocksize;
+	*block_off = name_offset >> sb->s_blocksize_bits;
+	*new_offset = name_offset % sb->s_blocksize;
 
 	return 0;
 }
@@ -3249,7 +3249,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 		prefix = ocfs2_xattr_prefix(type);
 
 		if (prefix) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 								bucket_xh(bucket),
 								i,
 								&block_off,
@@ -4843,7 +4843,7 @@ static int ocfs2_xattr_bucket_set_value_outside(struct
inode *inode,
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
-	ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
+	ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
 						xe - xh->xh_entries,
 						&block_off,
 						&offset);
@@ -5431,7 +5431,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode
*inode,
 		i = xbs->here - xbs->header->xh_entries;
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode,
+			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
 							bucket_xh(xbs->bucket),
 							i, &block_off,
 							&name_offset);
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 30/39] ocfs2: Abstract the creation of xattr block.

In xattr reflink, we also need to create xattr block, so
abstract the process out.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.c |  115 +++++++++++++++++++++++++++++++++---------------------
 1 files changed, 70 insertions(+), 45 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 82f260b..e3910a1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2104,6 +2104,72 @@ cleanup:
 	return ret;
 }
 
+static int ocfs2_create_xattr_block(handle_t *handle,
+				    struct inode *inode,
+				    struct buffer_head *inode_bh,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct buffer_head **ret_bh)
+{
+	int ret;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)inode_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_xattr_block *xblk;
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+				   &suballoc_bit_start, &num_got,
+				   &first_blkno);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	new_bh = sb_getblk(inode->i_sb, first_blkno);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
+				      new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+
+	/* Initialize ocfs2_xattr_block */
+	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+	memset(xblk, 0, inode->i_sb->s_blocksize);
+	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+	xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+	xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+	ret = ocfs2_journal_dirty(handle, new_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto end;
+	}
+	di->i_xattr_loc = cpu_to_le64(first_blkno);
+	ocfs2_journal_dirty(handle, inode_bh);
+
+	*ret_bh = new_bh;
+	new_bh = NULL;
+
+end:
+	brelse(new_bh);
+	return ret;
+}
+
 /*
  * ocfs2_xattr_block_set()
  *
@@ -2116,65 +2182,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
-	u16 suballoc_bit_start;
-	u32 num_got;
-	u64 first_blkno;
 	int ret;
 
 	if (!xs->xattr_bh) {
-		ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
-					      xs->inode_bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-
-		ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
-					   &suballoc_bit_start, &num_got,
-					   &first_blkno);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-
-		new_bh = sb_getblk(inode->i_sb, first_blkno);
-		ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
-
-		ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
-					      new_bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
+		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
+					       ctxt->meta_ac, &new_bh);
+		if (ret) {
 			mlog_errno(ret);
 			goto end;
 		}
 
-		/* Initialize ocfs2_xattr_block */
 		xs->xattr_bh = new_bh;
-		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
-		memset(xblk, 0, inode->i_sb->s_blocksize);
-		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
-		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
-		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
-		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
-		xblk->xb_blkno = cpu_to_le64(first_blkno);
-
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 		xs->header = &xblk->xb_attrs.xb_header;
 		xs->base = (void *)xs->header;
 		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
 		xs->here = xs->header->xh_entries;
-
-		ret = ocfs2_journal_dirty(handle, new_bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto end;
-		}
-		di->i_xattr_loc = cpu_to_le64(first_blkno);
-		ocfs2_journal_dirty(handle, xs->inode_bh);
 	} else
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 31/39] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process.

Currently we have ocfs2_iterate_xattr_buckets which can receive
a para and a callback to iterate a series of bucket. It is good.
But actually the 2 callers ocfs2_xattr_tree_list_index_block and
ocfs2_delete_xattr_index_block are almost the same. The only
difference is that the latter need to handle the extent record
also. So add a new function named ocfs2_iterate_xattr_index_block.
It can be given func callback which are used for exten record.
So now we only have one iteration function for the xattr index
block. Ane what's more, it is useful for our future reflink
operations.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.c |  147 ++++++++++++++++++++++++++++--------------------------
 1 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e3910a1..527cfe8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -157,7 +157,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 					struct ocfs2_xattr_search *xs);
 
 static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
-					struct ocfs2_xattr_tree_root *xt,
+					struct buffer_head *blk_bh,
 					char *buffer,
 					size_t buffer_size);
 
@@ -170,8 +170,23 @@ static int ocfs2_xattr_set_entry_index_block(struct inode
*inode,
 					     struct ocfs2_xattr_search *xs,
 					     struct ocfs2_xattr_set_ctxt *ctxt);
 
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
-					  struct buffer_head *xb_bh);
+typedef int (xattr_tree_rec_func)(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno, u32 cpos, u32 len, void *para);
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *root_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para);
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para);
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len,
+				  void *para);
+
 static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 				  u64 src_blk, u64 last_blk, u64 to_blk,
 				  unsigned int start_bucket,
@@ -870,11 +885,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
 		ret = ocfs2_xattr_list_entries(inode, header,
 					       buffer, buffer_size);
-	} else {
-		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
-		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+	} else
+		ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
 						   buffer, buffer_size);
-	}
 
 	brelse(blk_bh);
 
@@ -1800,7 +1813,10 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
 		ret = ocfs2_remove_value_outside(inode, &vb, header);
 	} else
-		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+		ret = ocfs2_iterate_xattr_index_block(inode,
+						blk_bh,
+						ocfs2_rm_xattr_cluster,
+						NULL);
 
 	return ret;
 }
@@ -3297,22 +3313,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	return ret;
 }
 
-static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
-					     struct ocfs2_xattr_tree_root *xt,
-					     char *buffer,
-					     size_t buffer_size)
+static int ocfs2_iterate_xattr_index_block(struct inode *inode,
+					   struct buffer_head *blk_bh,
+					   xattr_tree_rec_func *rec_func,
+					   void *para)
 {
-	struct ocfs2_extent_list *el = &xt->xt_list;
+	struct ocfs2_xattr_block *xb +			(struct ocfs2_xattr_block
*)blk_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
 	int ret = 0;
 	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
 	u64 p_blkno = 0;
-	struct ocfs2_xattr_tree_list xl = {
-		.buffer = buffer,
-		.buffer_size = buffer_size,
-		.result = 0,
-	};
 
-	if (le16_to_cpu(el->l_next_free_rec) == 0)
+	if (!el->l_next_free_rec || !rec_func)
 		return 0;
 
 	while (name_hash > 0) {
@@ -3320,15 +3333,14 @@ static int ocfs2_xattr_tree_list_index_block(struct
inode *inode,
 					  &e_cpos, &num_clusters, el);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			break;
 		}
 
-		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
-						  ocfs2_list_xattr_bucket,
-						  &xl);
+		ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
+			       num_clusters, para);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			break;
 		}
 
 		if (e_cpos == 0)
@@ -3337,6 +3349,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode
*inode,
 		name_hash = e_cpos - 1;
 	}
 
+	return ret;
+
+}
+
+static int ocfs2_list_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_list_xattr_bucket, para);
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct buffer_head *blk_bh,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+		.result = 0,
+	};
+
+	ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+					      ocfs2_list_xattr_tree_rec, &xl);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = xl.result;
 out:
 	return ret;
@@ -4895,7 +4938,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 				  struct buffer_head *root_bh,
 				  u64 blkno,
 				  u32 cpos,
-				  u32 len)
+				  u32 len,
+				  void *para)
 {
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4907,6 +4951,13 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct ocfs2_extent_tree et;
 
+	ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					  ocfs2_delete_xattr_in_bucket, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
 	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
@@ -5329,52 +5380,6 @@ static int ocfs2_delete_xattr_in_bucket(struct inode
*inode,
 	return ret;
 }
 
-static int ocfs2_delete_xattr_index_block(struct inode *inode,
-					  struct buffer_head *xb_bh)
-{
-	struct ocfs2_xattr_block *xb -			(struct ocfs2_xattr_block *)xb_bh->b_data;
-	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-	int ret = 0;
-	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
-	u64 p_blkno;
-
-	if (le16_to_cpu(el->l_next_free_rec) == 0)
-		return 0;
-
-	while (name_hash > 0) {
-		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
-					  &e_cpos, &num_clusters, el);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
-						  ocfs2_delete_xattr_in_bucket,
-						  NULL);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
-		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
-					     p_blkno, e_cpos, num_clusters);
-		if (ret) {
-			mlog_errno(ret);
-			break;
-		}
-
-		if (e_cpos == 0)
-			break;
-
-		name_hash = e_cpos - 1;
-	}
-
-out:
-	return ret;
-}
-
 /*
  * Whenever we modify a xattr value root in the bucket(e.g, CoW
  * or change the extent record flag), we need to recalculate
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 32/39] ocfs2: Attach xattr clusters to refcount tree.

In ocfs2, when xattr's value is larger than OCFS2_XATTR_INLINE_SIZE,
it will be kept outside of the blocks we store xattr entry. And they
are stored in a b-tree also. So this patch try to attach all these
clusters to refcount tree also.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |   40 +++++--
 fs/ocfs2/refcounttree.h |    7 +
 fs/ocfs2/xattr.c        |  291 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    6 +-
 4 files changed, 334 insertions(+), 10 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 96948b0..5356a71 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2790,12 +2790,13 @@ out:
  * Insert a new extent into refcount tree and mark a extent rec
  * as refcounted in the dinode tree.
  */
-static int ocfs2_add_refcount_flag(struct inode *inode,
-				   struct ocfs2_extent_tree *di_et,
-				   struct ocfs2_caching_info *ref_ci,
-				   struct buffer_head *ref_root_bh,
-				   u32 cpos, u32 p_cluster, u32 num_clusters,
-				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post)
 {
 	int ret;
 	handle_t *handle;
@@ -2824,6 +2825,9 @@ static int ocfs2_add_refcount_flag(struct inode *inode,
 		}
 	}
 
+	if (post)
+		credits += post->credits;
+
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -2831,7 +2835,7 @@ static int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_mark_extent_refcounted(inode, di_et, handle,
+	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
 					   cpos, num_clusters, p_cluster,
 					   meta_ac, dealloc);
 	if (ret) {
@@ -2842,8 +2846,16 @@ static int ocfs2_add_refcount_flag(struct inode *inode,
 	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
 					p_cluster, num_clusters,
 					meta_ac, dealloc);
-	if (ret)
+	if (ret) {
 		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (post && post->func) {
+		ret = post->func(inode, handle, post->para);
+		if (ret)
+			mlog_errno(ret);
+	}
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
@@ -2905,7 +2917,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
 					      &ref_tree->rf_ci, ref_root_bh,
 					      cpos - num_clusters,
 					      p_cluster, num_clusters,
-					      &dealloc);
+					      &dealloc, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -2917,6 +2929,16 @@ static int ocfs2_attach_refcount_tree(struct inode
*inode,
 	 * record from the disk.
 	 */
 	ocfs2_extent_map_trunc(inode, 0);
+
+	if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_xattr_attach_refcount_tree(inode, fe_bh,
+						       &ref_tree->rf_ci,
+						       ref_root_bh,
+						       &dealloc);
+		if (ret)
+			mlog_errno(ret);
+	}
+
 	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	brelse(ref_root_bh);
 
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 1dda219..0f9b7a5 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -100,4 +100,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 			     struct buffer_head *ref_root_bh,
 			     u32 cpos, u32 write_len,
 			     struct ocfs2_post_refcount *post);
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 527cfe8..0687852 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5549,6 +5549,297 @@ out:
 }
 
 /*
+ * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
+ * The physical clusters will be added to refcount tree.
+ */
+static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
+				struct ocfs2_xattr_value_root *xv,
+				struct ocfs2_extent_tree *value_et,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_post_refcount *refcount)
+{
+	int ret = 0;
+	u32 clusters = le32_to_cpu(xv->xr_clusters);
+	u32 cpos, p_cluster, num_clusters;
+	struct ocfs2_extent_list *el = &xv->xr_list;
+	unsigned int ext_flags;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el, &ext_flags);
+
+		cpos += num_clusters;
+		if ((ext_flags & OCFS2_EXT_REFCOUNTED))
+			continue;
+
+		BUG_ON(!p_cluster);
+
+		ret = ocfs2_add_refcount_flag(inode, value_et,
+					      ref_ci, ref_root_bh,
+					      cpos - num_clusters,
+					      p_cluster, num_clusters,
+					      dealloc, refcount);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Given a normal ocfs2_xattr_header, refcount all the entries which
+ * have value stored outside.
+ * Used for xattrs stored in inode and ocfs2_xattr_block.
+ */
+static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
+				struct ocfs2_xattr_value_buf *vb,
+				struct ocfs2_xattr_header *header,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_extent_tree et;
+	int i, ret = 0;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		xe = &header->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		xv = (struct ocfs2_xattr_value_root *)((void *)header +
+			le16_to_cpu(xe->xe_name_offset) +
+			OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+		vb->vb_xv = xv;
+		ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
+							ref_ci, ref_root_bh,
+							dealloc, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
+				struct buffer_head *fe_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
+				(fe_bh->b_data + inode->i_sb->s_blocksize -
+				le16_to_cpu(di->i_xattr_inline_size));
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = fe_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+						  ref_ci, ref_root_bh, dealloc);
+}
+
+struct ocfs2_xattr_tree_value_refcount_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh)
+{
+	int ret, block_off, name_offset;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+	void *base;
+
+	ret = ocfs2_xattr_bucket_get_name_value(sb,
+						bucket_xh(bucket),
+						offset,
+						&block_off,
+						&name_offset);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	base = bucket_block(bucket, block_off);
+
+	*xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
+			 OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (bh)
+		*bh = bucket->bu_bhs[block_off];
+out:
+	return ret;
+}
+
+/*
+ * For a given xattr bucket, refcount all the entries which
+ * have value stored outside.
+ */
+static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_xattr_tree_value_refcount_para *ref +			(struct
ocfs2_xattr_tree_value_refcount_para *)para;
+	struct ocfs2_xattr_header *xh +			(struct ocfs2_xattr_header
*)bucket->bu_bhs[0]->b_data;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+	struct ocfs2_post_refcount refcount = {
+		.credits = bucket->bu_blocks,
+		.para = bucket,
+		.func = ocfs2_xattr_bucket_post_refcount,
+	};
+	struct ocfs2_post_refcount *p = NULL;
+
+	/* We only need post_refcount if we support metaecc. */
+	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
+		p = &refcount;
+
+	mlog(0, "refcount bucket %llu, count = %u\n",
+	     (unsigned long long)bucket_blkno(bucket),
+	     le16_to_cpu(xh->xh_count));
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
+						      &vb.vb_xv, &vb.vb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_init_xattr_value_extent_tree(&et,
+						   INODE_CACHE(inode), &vb);
+
+		ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
+							&et, ref->ref_ci,
+							ref->ref_root_bh,
+							ref->dealloc, p);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+
+}
+
+static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
+				     struct buffer_head *root_bh,
+				     u64 blkno, u32 cpos, u32 len, void *para)
+{
+	return ocfs2_iterate_xattr_buckets(inode, blkno, len,
+					   ocfs2_xattr_bucket_value_refcount,
+					   para);
+}
+
+static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
+				struct buffer_head *blk_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	struct ocfs2_xattr_block *xb +				(struct ocfs2_xattr_block
*)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		struct ocfs2_xattr_value_buf vb = {
+			.vb_bh = blk_bh,
+			.vb_access = ocfs2_journal_access_xb,
+		};
+
+		ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+	} else {
+		struct ocfs2_xattr_tree_value_refcount_para para = {
+			.ref_ci = ref_ci,
+			.ref_root_bh = ref_root_bh,
+			.dealloc = dealloc,
+		};
+
+		ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
+						ocfs2_refcount_xattr_tree_rec,
+						&para);
+	}
+
+	return ret;
+}
+
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
+							 ref_ci, ref_root_bh,
+							 dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out;
+
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
+						ref_root_bh, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+out:
+
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a..a3295d7 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,5 +83,9 @@ struct ocfs2_xattr_value_buf {
 	struct ocfs2_xattr_value_root	*vb_xv;
 };
 
-
+int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
+				     struct buffer_head *fe_bh,
+				     struct ocfs2_caching_info *ref_ci,
+				     struct buffer_head *ref_root_bh,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_XATTR_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 33/39] ocfs2: Call refcount tree remove process properly.

Now with xattr refcount support, we need to check whether
we have xattr refcounted before we remove the refcount tree.

Now the mechanism is:
1) Check whether i_clusters == 0, if no, exit.
2) check whether we have i_xattr_loc in dinode. if yes, exit.
2) Check whether we have inline xattr stored outside, if yes, exit.
4) Remove the tree.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c        |    5 -----
 fs/ocfs2/file.c         |    3 +++
 fs/ocfs2/inode.c        |    7 +++++++
 fs/ocfs2/refcounttree.c |   36 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.h |    2 ++
 fs/ocfs2/xattr.c        |   23 +++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    2 ++
 7 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 2646f0c..82319e8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7441,11 +7441,6 @@ start:
 	goto start;
 
 bail:
-	if (!status && OCFS2_I(inode)->ip_clusters == 0) {
-		/* remove the refcount tree. */
-		status = ocfs2_remove_refcount_tree(inode, fe_bh);
-	}
-
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
 	if (tl_sem)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 44920a7..fef77e9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -483,6 +484,8 @@ bail_unlock_sem:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 bail:
+	if (!status && OCFS2_I(inode)->ip_clusters == 0)
+		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index e7c6b70..868f857 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -771,6 +772,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	status = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5356a71..b986c7c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -475,6 +475,42 @@ out:
 	return;
 }
 /*
+ * Try to remove refcount tree. The mechanism is:
+ * 1) Check whether i_clusters == 0, if no, exit.
+ * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
+ * 3) Check whether we have inline xattr stored outside, if yes, exit.
+ * 4) Remove the tree.
+ */
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&oi->ip_xattr_sem);
+	down_write(&oi->ip_alloc_sem);
+
+	if (oi->ip_clusters)
+		goto out;
+
+	if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) &&
di->i_xattr_loc)
+		goto out;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
+	    ocfs2_has_inline_xattr_value_outside(inode, di))
+		goto out;
+
+	ret = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (ret)
+		mlog_errno(ret);
+out:
+	up_write(&oi->ip_alloc_sem);
+	up_write(&oi->ip_xattr_sem);
+	return 0;
+}
+
+/*
  * Given a cpos and len, try to find the refcount record which contains cpos.
  * 1. If cpos can be found in one refcount record, return the record.
  * 2. If cpos can't be found, return a fake record which start from cpos
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 0f9b7a5..014aaab 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -47,6 +47,8 @@ int ocfs2_set_refcount_tree(struct inode *inode,
 			    struct buffer_head *di_bh,
 			    u64 blkno);
 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
+int ocfs2_try_remove_refcount_tree(struct inode *inode,
+				   struct buffer_head *di_bh);
 
 int ocfs2_decrease_refcount(struct inode *inode,
 			    handle_t *handle, u32 cpos, u32 len,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 0687852..455f216 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -840,6 +840,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
 	return result;
 }
 
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di)
+{
+	struct ocfs2_xattr_header *xh;
+	int i;
+
+	xh = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
+		if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
+			return 1;
+
+	return 0;
+}
+
 static int ocfs2_xattr_ibody_list(struct inode *inode,
 				  struct ocfs2_dinode *di,
 				  char *buffer,
@@ -2897,10 +2914,16 @@ int ocfs2_xattr_set(struct inode *inode,
 	if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
 		ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
 cleanup:
 	if (ref_tree)
 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	if (!value && !ret) {
+		ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
 	ocfs2_inode_unlock(inode, 1);
 cleanup_nolock:
 	brelse(di_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index a3295d7..e74703f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct
buffer_head *,
 			   int, const char *, const void *, size_t, int,
 			   struct ocfs2_alloc_context *,
 			   struct ocfs2_alloc_context *);
+int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
+					 struct ocfs2_dinode *di);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 int ocfs2_init_security_get(struct inode *, struct inode *,
 			    struct ocfs2_security_xattr_info *);
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 34/39] ocfs2: Create an xattr indexed block if needed.

With reflink, there is a need that we create a new xattr indexed
block from the very beginning. So add a new parameter for
ocfs2_create_xattr_block.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.c |   16 ++++++++++++++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 455f216..9876559 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2141,7 +2141,8 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 				    struct inode *inode,
 				    struct buffer_head *inode_bh,
 				    struct ocfs2_alloc_context *meta_ac,
-				    struct buffer_head **ret_bh)
+				    struct buffer_head **ret_bh,
+				    int indexed)
 {
 	int ret;
 	u16 suballoc_bit_start;
@@ -2187,6 +2188,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
 
+	if (indexed) {
+		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
+		xr->xt_clusters = cpu_to_le32(1);
+		xr->xt_last_eb_blk = 0;
+		xr->xt_list.l_tree_depth = 0;
+		xr->xt_list.l_count = cpu_to_le16(
+					ocfs2_xattr_recs_per_xb(inode->i_sb));
+		xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+		xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
+	}
+
 	ret = ocfs2_journal_dirty(handle, new_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2221,7 +2233,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 
 	if (!xs->xattr_bh) {
 		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
-					       ctxt->meta_ac, &new_bh);
+					       ctxt->meta_ac, &new_bh, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto end;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 35/39] ocfs2: Add reflink support for xattr.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |   36 ++-
 fs/ocfs2/refcounttree.h |    6 +
 fs/ocfs2/xattr.c        |  889 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h        |    4 +
 4 files changed, 922 insertions(+), 13 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b986c7c..4d86659 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1307,12 +1307,12 @@ out:
 	return ret;
 }
 
-static int __ocfs2_increase_refcount(handle_t *handle,
-				     struct ocfs2_caching_info *ci,
-				     struct buffer_head *ref_root_bh,
-				     u64 cpos, u32 len,
-				     struct ocfs2_alloc_context *meta_ac,
-				     struct ocfs2_cached_dealloc_ctxt *dealloc)
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0, index;
 	struct buffer_head *ref_leaf_bh = NULL;
@@ -2879,9 +2879,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-					p_cluster, num_clusters,
-					meta_ac, dealloc);
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -3026,9 +3026,9 @@ static int ocfs2_add_refcounted_extent(struct inode
*inode,
 		goto out_commit;
 	}
 
-	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-					p_cluster, num_clusters,
-					meta_ac, dealloc);
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
 	if (ret)
 		mlog_errno(ret);
 
@@ -3200,9 +3200,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 	}
 
 	ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh);
-	if (ret)
+	if (ret) {
 		mlog_errno(ret);
+		goto inode_unlock;
+	}
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh);
+		if (ret)
+			mlog_errno(ret);
+	}
+inode_unlock:
 	ocfs2_inode_unlock(new_inode, 1);
 	brelse(new_bh);
 out_unlock:
@@ -3270,12 +3278,14 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry,
 		goto out_unlock;
 	}
 
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	vfs_dq_init(dir);
 	error = __ocfs2_reflink(old_dentry, old_bh, new_orphan_inode);
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	if (error)
 		mlog_errno(error);
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 
 	ocfs2_inode_unlock(inode, 1);
 	brelse(old_bh);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 014aaab..f796118 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -109,4 +109,10 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 			    u32 cpos, u32 p_cluster, u32 num_clusters,
 			    struct ocfs2_cached_dealloc_ctxt *dealloc,
 			    struct ocfs2_post_refcount *post);
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc);
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9876559..c3a79fb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5875,6 +5875,895 @@ out:
 }
 
 /*
+ * Store the information we need in xattr reflink.
+ * old_bh and new_bh are inode bh for the old and new inode.
+ */
+struct ocfs2_xattr_reflink {
+	struct inode *old_inode;
+	struct inode *new_inode;
+	struct buffer_head *old_bh;
+	struct buffer_head *new_bh;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+};
+
+/*
+ * Given a xattr header and xe offset,
+ * return the proper xv and the corresponding bh.
+ * xattr in inode, block and xattr tree have different implementaions.
+ */
+typedef int (get_xattr_value_root)(struct super_block *sb,
+				   struct buffer_head *bh,
+				   struct ocfs2_xattr_header *xh,
+				   int offset,
+				   struct ocfs2_xattr_value_root **xv,
+				   struct buffer_head **ret_bh,
+				   void *para);
+
+/*
+ * Calculate all the xattr value root metadata stored in this xattr header and
+ * credits we need if we create them from the scratch.
+ * We use get_xattr_value_root so that all types of xattr container can use it.
+ */
+static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
+					     struct buffer_head *bh,
+					     struct ocfs2_xattr_header *xh,
+					     int *metas, int *credits,
+					     get_xattr_value_root *func,
+					     void *para)
+{
+	int i, ret = 0;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		*metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
+			  le16_to_cpu(xv->xr_list.l_next_free_rec);
+
+		*credits += ocfs2_calc_extend_credits(sb,
+						&def_xv.xv.xr_list,
+						le32_to_cpu(xv->xr_clusters));
+	}
+
+	return ret;
+}
+
+/* Used by xattr inode and block to return the right xv and buffer_head. */
+static int ocfs2_get_xattr_value_root(struct super_block *sb,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *xh,
+				      int offset,
+				      struct ocfs2_xattr_value_root **xv,
+				      struct buffer_head **ret_bh,
+				      void *para)
+{
+	struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
+
+	*xv = (struct ocfs2_xattr_value_root *)((void *)xh +
+		le16_to_cpu(xe->xe_name_offset) +
+		OCFS2_XATTR_SIZE(xe->xe_name_len));
+
+	if (ret_bh)
+		*ret_bh = bh;
+
+	return 0;
+}
+
+/*
+ * Lock the meta_ac and caculate how much credits we need for reflink xattrs.
+ * It is only used for inline xattr and xattr block.
+ */
+static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
+					struct ocfs2_xattr_header *xh,
+					struct buffer_head *ref_root_bh,
+					int *credits,
+					struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, meta_add = 0;
+	struct ocfs2_refcount_block *rb +			(struct ocfs2_refcount_block
*)ref_root_bh->b_data;
+
+	*credits = 0;
+
+	ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
+						&meta_add, credits,
+						ocfs2_get_xattr_value_root,
+						NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	/*
+	 * We have to add credits for modifying all the metadata in the
+	 * refount tree.
+	 */
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
+	else
+		*credits += 1;
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a xattr header, reflink all the xattrs in this container.
+ * It can be used for inode, block and bucket.
+ *
+ * NOTE:
+ * Before we call this function, the caller has memcpy the xattr in
+ * old_xh to the new_xh.
+ */
+static int ocfs2_reflink_xattr_header(handle_t *handle,
+				      struct ocfs2_xattr_reflink *args,
+				      struct buffer_head *old_bh,
+				      struct ocfs2_xattr_header *xh,
+				      struct buffer_head *new_bh,
+				      struct ocfs2_xattr_header *new_xh,
+				      struct ocfs2_xattr_value_buf *vb,
+				      struct ocfs2_alloc_context *meta_ac,
+				      get_xattr_value_root *func,
+				      void *para)
+{
+	int ret = 0, i;
+	struct super_block *sb = args->old_inode->i_sb;
+	struct buffer_head *value_bh;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_value_root *xv, *new_xv;
+	struct ocfs2_extent_tree data_et;
+	u32 clusters, cpos, p_cluster, num_clusters;
+	unsigned int ext_flags = 0;
+
+	mlog(0, "reflink xattr in container %llu, count = %u\n",
+	     (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = func(sb, old_bh, xh, i, &xv, NULL, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = func(sb, new_bh, new_xh, i, &new_xv, &value_bh, para);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * For the xattr which has l_tree_depth = 0, all the extent
+		 * recs have already be copied to the new xh with the
+		 * propriate OCFS2_EXT_REFCOUNTED flag we just need to
+		 * increase the refount count int the refcount tree.
+		 *
+		 * For the xattr which has l_tree_depth > 0, we need
+		 * to initialize it to the empty default value root,
+		 * and then insert the extents one by one.
+		 */
+		if (xv->xr_list.l_tree_depth) {
+			memcpy(new_xv, &def_xv, sizeof(def_xv));
+			vb->vb_xv = new_xv;
+			vb->vb_bh = value_bh;
+			ocfs2_init_xattr_value_extent_tree(&data_et,
+					INODE_CACHE(args->new_inode), vb);
+		}
+
+		clusters = le32_to_cpu(xv->xr_clusters);
+		cpos = 0;
+		while (cpos < clusters) {
+			ret = ocfs2_xattr_get_clusters(args->old_inode,
+						       cpos,
+						       &p_cluster,
+						       &num_clusters,
+						       &xv->xr_list,
+						       &ext_flags);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!p_cluster);
+
+			if (xv->xr_list.l_tree_depth) {
+				ret = ocfs2_insert_extent(handle,
+						&data_et, cpos,
+						ocfs2_clusters_to_blocks(
+							args->old_inode->i_sb,
+							p_cluster),
+						num_clusters, ext_flags,
+						meta_ac);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+
+			ret = ocfs2_increase_refcount(handle, args->ref_ci,
+						      args->ref_root_bh,
+						      p_cluster, num_clusters,
+						      meta_ac, args->dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cpos += num_clusters;
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
+	int inline_size = le16_to_cpu(di->i_xattr_inline_size);
+	int header_off = osb->sb->s_blocksize - inline_size;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
+					(args->old_bh->b_data + header_off);
+	struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
+					(args->new_bh->b_data + header_off);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_inode_info *new_oi;
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = args->new_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
+				      args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(args->new_bh->b_data + header_off,
+	       args->old_bh->b_data + header_off, inline_size);
+
+	new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+	new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
+					 args->new_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_oi = OCFS2_I(args->new_inode);
+	spin_lock(&new_oi->ip_lock);
+	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
+	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+	spin_unlock(&new_oi->ip_lock);
+
+	ocfs2_journal_dirty(handle, args->new_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_create_empty_xattr_block(struct inode *inode,
+					  struct buffer_head *fe_bh,
+					  struct buffer_head **ret_bh,
+					  int indexed)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "create new xattr block for inode %llu, index = %d\n",
+	     (unsigned long long)fe_bh->b_blocknr, indexed);
+	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
+				       meta_ac, ret_bh, indexed);
+	if (ret)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
+				     struct buffer_head *blk_bh,
+				     struct buffer_head *new_blk_bh)
+{
+	int ret = 0, credits = 0;
+	handle_t *handle;
+	struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
+	struct ocfs2_dinode *new_di;
+	struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
+	int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	struct ocfs2_xattr_block *xb +			(struct ocfs2_xattr_block
*)blk_bh->b_data;
+	struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_block *new_xb +			(struct ocfs2_xattr_block
*)new_blk_bh->b_data;
+	struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = new_blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
+
+	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
+						  &credits, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* One more credits in case we need to add xattr flags in new inode. */
+	handle = ocfs2_start_trans(osb, credits + 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		ret = ocfs2_journal_access_di(handle,
+					      INODE_CACHE(args->new_inode),
+					      args->new_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
+				      new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
+	       osb->sb->s_blocksize - header_off);
+
+	ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
+					 new_blk_bh, new_xh, &vb, meta_ac,
+					 ocfs2_get_xattr_value_root, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_journal_dirty(handle, new_blk_bh);
+
+	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
+		new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
+		spin_lock(&new_oi->ip_lock);
+		new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+		new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
+		spin_unlock(&new_oi->ip_lock);
+
+		ocfs2_journal_dirty(handle, args->new_bh);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+struct ocfs2_reflink_xattr_tree_args {
+	struct ocfs2_xattr_reflink *reflink;
+	struct buffer_head *old_blk_bh;
+	struct buffer_head *new_blk_bh;
+	struct ocfs2_xattr_bucket *old_bucket;
+	struct ocfs2_xattr_bucket *new_bucket;
+};
+
+/*
+ * NOTE:
+ * We have to handle the case that both old bucket and new bucket
+ * will call this function to get the right ret_bh.
+ * So The caller must give us the right bh.
+ */
+static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_reflink_xattr_tree_args *args +			(struct
ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_xattr_bucket *bucket;
+
+	if (bh == args->old_bucket->bu_bhs[0])
+		bucket = args->old_bucket;
+	else
+		bucket = args->new_bucket;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+struct ocfs2_value_tree_metas {
+	int num_metas;
+	int credits;
+};
+
+static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_header *xh,
+					int offset,
+					struct ocfs2_xattr_value_root **xv,
+					struct buffer_head **ret_bh,
+					void *para)
+{
+	struct ocfs2_xattr_bucket *bucket +				(struct ocfs2_xattr_bucket *)para;
+
+	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
+					       xv, ret_bh);
+}
+
+static int ocfs2_calc_value_tree_metas(struct inode *inode,
+				      struct ocfs2_xattr_bucket *bucket,
+				      void *para)
+{
+	struct ocfs2_value_tree_metas *metas +			(struct ocfs2_value_tree_metas
*)para;
+	struct ocfs2_xattr_header *xh +			(struct ocfs2_xattr_header
*)bucket->bu_bhs[0]->b_data;
+
+	/* Add the credits for this bucket first. */
+	metas->credits += bucket->bu_blocks;
+	return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
+					xh, &metas->num_metas,
+					&metas->credits,
+					ocfs2_value_tree_metas_in_bucket,
+					bucket);
+}
+
+/*
+ * Given a xattr extent rec starting from blkno and having len clusters,
+ * iterate all the buckets calculate how much metadata we need for reflinking
+ * all the ocfs2_xattr_value_root and lock the allocators accordingly.
+ */
+static int ocfs2_lock_reflink_xattr_rec_allocators(
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *xt_et,
+				u64 blkno, u32 len, int *credits,
+				struct ocfs2_alloc_context **meta_ac,
+				struct ocfs2_alloc_context **data_ac)
+{
+	int ret, num_free_extents;
+	struct ocfs2_value_tree_metas metas;
+	struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
+	struct ocfs2_refcount_block *rb;
+
+	memset(&metas, 0, sizeof(metas));
+
+	ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
+					  ocfs2_calc_value_tree_metas, &metas);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	rb = (struct ocfs2_refcount_block
*)args->reflink->ref_root_bh->b_data;
+	/*
+	 * We have to add credits for modifying all the metadata in the
+	 * refount tree. We have to touch the root refcount block and
+	 * the refcount leaf block if necessary.
+	 */
+	*credits = metas.credits + 1;
+	if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
+		*credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
+			    le16_to_cpu(rb->rf_list.l_next_free_rec);
+
+	/* count in the xattr tree change. */
+	num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (num_free_extents < len)
+		metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
+
+	*credits += ocfs2_calc_extend_credits(osb->sb,
+					      xt_et->et_root_el, len);
+
+	if (metas.num_metas) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
+							meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (len) {
+		ret = ocfs2_reserve_clusters(osb, len, data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			meta_ac = NULL;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				u64 blkno, u64 new_blkno, u32 clusters,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_reflink_xattr_tree_args *args)
+{
+	int i, j, ret = 0;
+	struct super_block *sb = args->reflink->old_inode->i_sb;
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+	u32 num_buckets = clusters * bpc;
+	int bpb = args->old_bucket->bu_blocks;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
+
+	for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
+		ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(
+				bucket_xh(args->old_bucket)->xh_num_buckets);
+
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		for (j = 0; j < bpb; j++)
+			memcpy(bucket_block(args->new_bucket, j),
+			       bucket_block(args->old_bucket, j),
+			       sb->s_blocksize);
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
+		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
+					args->old_bucket->bu_bhs[0],
+					bucket_xh(args->old_bucket),
+					args->new_bucket->bu_bhs[0],
+					bucket_xh(args->new_bucket),
+					&vb, meta_ac,
+					ocfs2_get_reflink_xattr_value_root,
+					args);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * Re-access and dirty the bucket to calculate metaecc.
+		 * Because we may extend the transaction in reflink_xattr_header
+		 * which will let the already accessed block gone.
+		 */
+		ret = ocfs2_xattr_bucket_journal_access(handle,
+						args->new_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+		ocfs2_xattr_bucket_relse(args->old_bucket);
+		ocfs2_xattr_bucket_relse(args->new_bucket);
+	}
+
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+	ocfs2_xattr_bucket_relse(args->new_bucket);
+	return ret;
+}
+/*
+ * Create the same xattr extent record in the new inode's xattr tree.
+ */
+static int ocfs2_reflink_xattr_rec(struct inode *inode,
+				   struct buffer_head *root_bh,
+				   u64 blkno,
+				   u32 cpos,
+				   u32 len,
+				   void *para)
+{
+	int ret, credits = 0;
+	u32 p_cluster, num_clusters;
+	u64 new_blkno;
+	handle_t *handle;
+	struct ocfs2_reflink_xattr_tree_args *args +			(struct
ocfs2_reflink_xattr_tree_args *)para;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_tree_extent_tree(&et,
+					  INODE_CACHE(args->reflink->new_inode),
+					  args->new_blk_bh);
+
+	ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
+						      len, &credits,
+						      &meta_ac, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac,
+				   len, &p_cluster, &num_clusters);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
+
+	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
+	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
+	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
+					  meta_ac, data_ac, args);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+	     (unsigned long long)new_blkno, len, cpos);
+	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
+				  len, 0, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	return ret;
+}
+
+/*
+ * Create reflinked xattr buckets.
+ * We will add bucket one by one, and refcount all the xattrs in the bucket
+ * if they are stored outside.
+ */
+static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
+				    struct buffer_head *blk_bh,
+				    struct buffer_head *new_blk_bh)
+{
+	int ret;
+	struct ocfs2_reflink_xattr_tree_args para;
+
+	memset(&para, 0, sizeof(para));
+	para.reflink = args;
+	para.old_blk_bh = blk_bh;
+	para.new_blk_bh = new_blk_bh;
+
+	para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
+	if (!para.old_bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
+	para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
+	if (!para.new_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
+					      ocfs2_reflink_xattr_rec,
+					      &para);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_xattr_bucket_free(para.old_bucket);
+	ocfs2_xattr_bucket_free(para.new_bucket);
+	return ret;
+}
+
+static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
+					struct buffer_head *blk_bh)
+{
+	int ret, indexed = 0;
+	struct buffer_head *new_blk_bh = NULL;
+	struct ocfs2_xattr_block *xb +			(struct ocfs2_xattr_block
*)blk_bh->b_data;
+
+
+	if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
+		indexed = 1;
+
+	ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
+					     &new_blk_bh, indexed);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+		ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
+	else
+		ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	brelse(new_blk_bh);
+	return ret;
+}
+
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh)
+{
+	int ret;
+	struct ocfs2_xattr_reflink args;
+	struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_refcount_tree *ref_tree;
+	struct buffer_head *ref_root_bh = NULL;
+
+	ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				       le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	args.old_inode = old_inode;
+	args.new_inode = new_inode;
+	args.old_bh = old_bh;
+	args.new_bh = new_bh;
+	args.ref_ci = &ref_tree->rf_ci;
+	args.ref_root_bh = ref_root_bh;
+	args.dealloc = &dealloc;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_reflink_xattr_inline(&args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+	}
+
+	if (!di->i_xattr_loc)
+		goto out_unlock;
+
+	ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	brelse(blk_bh);
+
+out_unlock:
+	ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
+				   ref_tree, 1);
+	brelse(ref_root_bh);
+
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
+		ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
+	}
+
+out:
+	return ret;
+}
+
+/*
  * 'security' attributes support
  */
 static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e74703f..4f91305 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -90,4 +90,8 @@ int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
 				     struct ocfs2_caching_info *ref_ci,
 				     struct buffer_head *ref_root_bh,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_reflink_xattrs(struct inode *old_inode,
+			 struct buffer_head *old_bh,
+			 struct inode *new_inode,
+			 struct buffer_head *new_bh);
 #endif /* OCFS2_XATTR_H */
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 36/39] ocfs2: Modify removing xattr process for refcount.

The old xattr value remove is quite simple, it just erase the
tree and free the clusters. But as we have added refcount support,
The process is a little complicated.

We have to lock the refcount tree at the beginning, what's more,
we may split the refcount tree in some cases, so meta/credits are
needed.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.c |  190 +++++++++++++++++++++++++++++++++++++++++++----------
 1 files changed, 154 insertions(+), 36 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c3a79fb..b815171 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -199,6 +199,11 @@ static int ocfs2_prepare_refcount_xattr(struct inode
*inode,
 					struct ocfs2_refcount_tree **ref_tree,
 					int *meta_need,
 					int *credits);
+static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
+					   struct ocfs2_xattr_bucket *bucket,
+					   int offset,
+					   struct ocfs2_xattr_value_root **xv,
+					   struct buffer_head **bh);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -1751,51 +1756,112 @@ out:
 	return ret;
 }
 
+/*
+ * In xattr remove, if it is stored outside and refcounted, we may have
+ * the chance to split the refcount tree. So need the allocators.
+ */
+static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
+					struct ocfs2_xattr_value_root *xv,
+					struct ocfs2_caching_info *ref_ci,
+					struct buffer_head *ref_root_bh,
+					struct ocfs2_alloc_context **meta_ac,
+					int *ref_credits)
+{
+	int ret, meta_add = 0;
+	u32 p_cluster, num_clusters;
+	unsigned int ext_flags;
+
+	*ref_credits = 0;
+	ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
+				       &num_clusters,
+				       &xv->xr_list,
+				       &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
+						 ref_root_bh, xv,
+						 &meta_add, ref_credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+						meta_add, meta_ac);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
 static int ocfs2_remove_value_outside(struct inode*inode,
 				      struct ocfs2_xattr_value_buf *vb,
-				      struct ocfs2_xattr_header *header)
+				      struct ocfs2_xattr_header *header,
+				      struct ocfs2_caching_info *ref_ci,
+				      struct buffer_head *ref_root_bh)
 {
-	int ret = 0, i;
+	int ret = 0, i, ref_credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	void *val;
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb,
-					ocfs2_remove_extent_credits(osb->sb));
-	if (IS_ERR(ctxt.handle)) {
-		ret = PTR_ERR(ctxt.handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
-		if (!ocfs2_xattr_is_local(entry)) {
-			void *val;
+		if (ocfs2_xattr_is_local(entry))
+			continue;
 
-			val = (void *)header +
-				le16_to_cpu(entry->xe_name_offset);
-			vb->vb_xv = (struct ocfs2_xattr_value_root *)
-				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
-			if (ret < 0) {
-				mlog_errno(ret);
-				break;
-			}
+		val = (void *)header +
+			le16_to_cpu(entry->xe_name_offset);
+		vb->vb_xv = (struct ocfs2_xattr_value_root *)
+			(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
+							 ref_ci, ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, ref_credits +
+					ocfs2_remove_extent_credits(osb->sb));
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
 		}
 	}
 
-	ocfs2_commit_trans(osb, ctxt.handle);
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
-out:
 	return ret;
 }
 
 static int ocfs2_xattr_ibody_remove(struct inode *inode,
-				    struct buffer_head *di_bh)
+				    struct buffer_head *di_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
 {
 
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1810,13 +1876,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
 		 ((void *)di + inode->i_sb->s_blocksize -
 		 le16_to_cpu(di->i_xattr_inline_size));
 
-	ret = ocfs2_remove_value_outside(inode, &vb, header);
+	ret = ocfs2_remove_value_outside(inode, &vb, header,
+					 ref_ci, ref_root_bh);
 
 	return ret;
 }
 
+struct ocfs2_rm_xattr_bucket_para {
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+};
+
 static int ocfs2_xattr_block_remove(struct inode *inode,
-				    struct buffer_head *blk_bh)
+				    struct buffer_head *blk_bh,
+				    struct ocfs2_caching_info *ref_ci,
+				    struct buffer_head *ref_root_bh)
 {
 	struct ocfs2_xattr_block *xb;
 	int ret = 0;
@@ -1824,22 +1898,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 		.vb_bh = blk_bh,
 		.vb_access = ocfs2_journal_access_xb,
 	};
+	struct ocfs2_rm_xattr_bucket_para args = {
+		.ref_ci = ref_ci,
+		.ref_root_bh = ref_root_bh,
+	};
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-		ret = ocfs2_remove_value_outside(inode, &vb, header);
+		ret = ocfs2_remove_value_outside(inode, &vb, header,
+						 ref_ci, ref_root_bh);
 	} else
 		ret = ocfs2_iterate_xattr_index_block(inode,
 						blk_bh,
 						ocfs2_rm_xattr_cluster,
-						NULL);
+						&args);
 
 	return ret;
 }
 
 static int ocfs2_xattr_free_block(struct inode *inode,
-				  u64 block)
+				  u64 block,
+				  struct ocfs2_caching_info *ref_ci,
+				  struct buffer_head *ref_root_bh)
 {
 	struct inode *xb_alloc_inode;
 	struct buffer_head *xb_alloc_bh = NULL;
@@ -1857,7 +1938,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1917,6 +1998,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct
buffer_head *di_bh)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree = NULL;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_caching_info *ref_ci = NULL;
 	handle_t *handle;
 	int ret;
 
@@ -1926,8 +2010,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct
buffer_head *di_bh)
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		return 0;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
+		ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
+					       le64_to_cpu(di->i_refcount_loc),
+					       1, &ref_tree, &ref_root_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ref_ci = &ref_tree->rf_ci;
+
+	}
+
 	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
-		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh,
+					       ref_ci, ref_root_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1936,7 +2033,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct
buffer_head *di_bh)
 
 	if (di->i_xattr_loc) {
 		ret = ocfs2_xattr_free_block(inode,
-					     le64_to_cpu(di->i_xattr_loc));
+					     le64_to_cpu(di->i_xattr_loc),
+					     ref_ci, ref_root_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1970,6 +2068,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct
buffer_head *di_bh)
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
+	if (ref_tree)
+		ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
+	brelse(ref_root_bh);
 	return ret;
 }
 
@@ -4987,7 +5088,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	struct ocfs2_extent_tree et;
 
 	ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
-					  ocfs2_delete_xattr_in_bucket, NULL);
+					  ocfs2_delete_xattr_in_bucket, para);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -5376,7 +5477,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode
*inode,
 					struct ocfs2_xattr_bucket *bucket,
 					void *para)
 {
-	int ret = 0;
+	int ret = 0, ref_credits;
 	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
@@ -5384,7 +5485,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode
*inode,
 	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
 	int credits = ocfs2_remove_extent_credits(osb->sb) +
 		ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_rm_xattr_bucket_para *args +			(struct ocfs2_rm_xattr_bucket_para
*)para;
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
@@ -5393,7 +5496,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode
*inode,
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
-		ctxt.handle = ocfs2_start_trans(osb, credits);
+		ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
+						      i, &xv, NULL);
+
+		ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
+							 args->ref_ci,
+							 args->ref_root_bh,
+							 &ctxt.meta_ac,
+							 &ref_credits);
+
+		ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
 		if (IS_ERR(ctxt.handle)) {
 			ret = PTR_ERR(ctxt.handle);
 			mlog_errno(ret);
@@ -5404,12 +5516,18 @@ static int ocfs2_delete_xattr_in_bucket(struct inode
*inode,
 							i, 0, &ctxt);
 
 		ocfs2_commit_trans(osb, ctxt.handle);
+		if (ctxt.meta_ac) {
+			ocfs2_free_alloc_context(ctxt.meta_ac);
+			ctxt.meta_ac = NULL;
+		}
 		if (ret) {
 			mlog_errno(ret);
 			break;
 		}
 	}
 
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 	return ret;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 37/39] ocfs2: Don't merge in 1st refcount ops of reflink.

Actually the whole reflink will touch refcount tree 2 times:
1. It will add the clusters in the extent record to the tree if it
   isn't refcounted before.
2. It will add 1 refcount to these clusters when it add these
   extent records to the tree.

So actually we shouldn't do merge in the 1st operation since the 2nd
one will soon be called and we may have to split it again. Do a merge
first and split soon is a waste of time. So we only merge in the 2nd
round. This is done by adding a new internal __ocfs2_increase_refcount
and call it with "not-merge" for 1st refcount operation in reflink.

This also has a side-effect that we don't need to worry too much about
the metadata allocation in the 2nd round since it will only merge and
no split will happen for those records.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/refcounttree.c |   56 ++++++++++++++++++++++++++++++----------------
 1 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4d86659..86d4884 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -705,7 +705,7 @@ static void ocfs2_refcount_rec_merge(struct
ocfs2_refcount_block *rb,
 static int ocfs2_change_refcount_rec(handle_t *handle,
 				     struct ocfs2_caching_info *ci,
 				     struct buffer_head *ref_leaf_bh,
-				     int index, int change)
+				     int index, int merge, int change)
 {
 	int ret;
 	struct ocfs2_refcount_block *rb @@ -734,7 +734,7 @@ static int
ocfs2_change_refcount_rec(handle_t *handle,
 		}
 
 		le16_add_cpu(&rl->rl_used, -1);
-	} else
+	} else if (merge)
 		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
@@ -1109,7 +1109,7 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 				     struct buffer_head *ref_root_bh,
 				     struct buffer_head *ref_leaf_bh,
 				     struct ocfs2_refcount_rec *rec,
-				     int index,
+				     int index, int merge,
 				     struct ocfs2_alloc_context *meta_ac)
 {
 	int ret;
@@ -1161,7 +1161,8 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 
 	le16_add_cpu(&rf_list->rl_used, 1);
 
-	ocfs2_refcount_rec_merge(rb, index);
+	if (merge)
+		ocfs2_refcount_rec_merge(rb, index);
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
 	if (ret)
@@ -1186,7 +1187,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 				    struct buffer_head *ref_root_bh,
 				    struct buffer_head *ref_leaf_bh,
 				    struct ocfs2_refcount_rec *split_rec,
-				    int index,
+				    int index, int merge,
 				    struct ocfs2_alloc_context *meta_ac,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
@@ -1295,7 +1296,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 
 	if (split_rec->r_refcount) {
 		rf_list->rl_recs[index] = *split_rec;
-		ocfs2_refcount_rec_merge(rb, index);
+		if (merge)
+			ocfs2_refcount_rec_merge(rb, index);
 	}
 
 	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
@@ -1307,12 +1309,12 @@ out:
 	return ret;
 }
 
-int ocfs2_increase_refcount(handle_t *handle,
-			    struct ocfs2_caching_info *ci,
-			    struct buffer_head *ref_root_bh,
-			    u64 cpos, u32 len,
-			    struct ocfs2_alloc_context *meta_ac,
-			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+static int __ocfs2_increase_refcount(handle_t *handle,
+				     struct ocfs2_caching_info *ci,
+				     struct buffer_head *ref_root_bh,
+				     u64 cpos, u32 len, int merge,
+				     struct ocfs2_alloc_context *meta_ac,
+				     struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret = 0, index;
 	struct buffer_head *ref_leaf_bh = NULL;
@@ -1350,7 +1352,8 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     "count %u\n", (unsigned long long)cpos, set_len,
 			     le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_change_refcount_rec(handle, ci,
-							ref_leaf_bh, index, 1);
+							ref_leaf_bh, index,
+							merge, 1);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1363,7 +1366,8 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     set_len);
 			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
 							ref_leaf_bh,
-							&rec, index, meta_ac);
+							&rec, index,
+							merge, meta_ac);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1381,7 +1385,7 @@ int ocfs2_increase_refcount(handle_t *handle,
 			     set_len, le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_split_refcount_rec(handle, ci,
 						       ref_root_bh, ref_leaf_bh,
-						       &rec, index,
+						       &rec, index, merge,
 						       meta_ac, dealloc);
 			if (ret) {
 				mlog_errno(ret);
@@ -1459,6 +1463,18 @@ out:
 	return ret;
 }
 
+int ocfs2_increase_refcount(handle_t *handle,
+			    struct ocfs2_caching_info *ci,
+			    struct buffer_head *ref_root_bh,
+			    u64 cpos, u32 len,
+			    struct ocfs2_alloc_context *meta_ac,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
+					 cpos, len, 1,
+					 meta_ac, dealloc);
+}
+
 static int ocfs2_decrease_refcount_rec(handle_t *handle,
 				struct ocfs2_caching_info *ci,
 				struct buffer_head *ref_root_bh,
@@ -1479,7 +1495,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 	if (cpos == le64_to_cpu(rec->r_cpos) && cpos + len = 	   
le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters))
 		ret = ocfs2_change_refcount_rec(handle, ci,
-						ref_leaf_bh, index, -1);
+						ref_leaf_bh, index, 1, -1);
 	else {
 		struct ocfs2_refcount_rec split = *rec;
 		split.r_cpos = cpu_to_le64(cpos);
@@ -1495,7 +1511,7 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 		     le32_to_cpu(rec->r_clusters));
 		ret = ocfs2_split_refcount_rec(handle, ci,
 					       ref_root_bh, ref_leaf_bh,
-					       &split, index,
+					       &split, index, 1,
 					       meta_ac, dealloc);
 	}
 
@@ -2879,9 +2895,9 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
-				      p_cluster, num_clusters,
-				      meta_ac, dealloc);
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters, 0,
+					meta_ac, dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 38/39] ocfs2: Make transaction extend more efficient.

In ocfs2_extend_rotate_transaction, op_credits is the orignal
credits in the handle and we only want to extend the credits
for the rotation, but the old solution always double it. It
is harmless for some minor operations, but for actions like
reflink we may rotate tree many times and cause the credits
increase dramatically. So this patch try to only increase
the desired credits.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 82319e8..5dc65d6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2256,10 +2256,18 @@ static int ocfs2_extend_rotate_transaction(handle_t
*handle, int subtree_depth,
 					   int op_credits,
 					   struct ocfs2_path *path)
 {
+	int ret;
 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
-	if (handle->h_buffer_credits < credits)
-		return ocfs2_extend_trans(handle, credits);
+	if (handle->h_buffer_credits < credits) {
+		ret = ocfs2_extend_trans(handle,
+					 credits - handle->h_buffer_credits);
+		if (ret)
+			return ret;
+
+		if (unlikely(handle->h_buffer_credits < credits))
+			return ocfs2_extend_trans(handle, credits);
+	}
 
 	return 0;
 }
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-29 22:58 UTC

head link

[Ocfs2-devel] [PATCH 39/39] ocfs2: Enable refcount tree support.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/ocfs2_fs.h |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 04adaf9..a4c6015 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -99,7 +99,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_XATTR \
 					 | OCFS2_FEATURE_INCOMPAT_META_ECC \
-					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
+					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
-- 
1.6.2.rc2.16.gf474c

Tao Ma

2009-Apr-30 07:59 UTC

head link

[Ocfs2-devel] [PATCH 00/39] ocfs2: Add reflink file support. V3

Hi all,
	So I have finally finished the v3 of reflink for ocfs2. The biggest 
change is that we support 64bit cluster offset now(Thank Mark and Joel 
for it).

[View]
http://oss.oracle.com/git/?p=tma/linux-2.6.git;a=shortlog;h=refcount
[Pull]
git://oss.oracle.com/git/tma/linux-2.6.git refcount

The general information for reflink, please see
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/Reflink.

For the design doc, please see
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/RefcountTrees
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkOperation
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkUses

The patch set is based on Joel's work of "ocfs2: Detach ocfs2 metadata
I/O from struct node" which can be found at
http://oss.oracle.com/pipermail/ocfs2-devel/2009-February/003926.html.

btw, some patches have ~1000 lines which make the review a little tough.
I will try to divide them into small parts in the next round. Please be 
patient.

Enjoy it.

Regards,
Tao

Christoph Hellwig

2009-May-01 07:34 UTC

head link

[Ocfs2-devel] [PATCH 17/39] ocfs2: Add ioctl for reflink.

On Thu, Apr 30, 2009 at 06:58:29AM +0800, Tao Ma wrote:> The ioctl will take 2 parameters: old_path and new_path and
> works like link.
As discussed during the FS/Storage summit please make this a proper
sys_reflink system call and don't add a stupid ioctl.

Joel Becker

2009-May-02 03:53 UTC

head link

[Ocfs2-devel] [PATCH 10/39] ocfs2: Add support for incrementing refcount in the tree.

On Thu, Apr 30, 2009 at 06:58:22AM +0800, Tao Ma wrote:> +
> +static void ocfs2_get_refcount_rec_from_list(struct ocfs2_caching_info
*ci,
> +					struct buffer_head *ref_leaf_bh,
> +					u64 cpos, unsigned int len,
> +					struct ocfs2_refcount_rec *ret_rec,
> +					int *index)
> +{
> +	int i = 0;
> +	struct ocfs2_refcount_block *rb > +		(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
> +	struct ocfs2_refcount_rec *rec = NULL;
> +
> +	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
> +		rec = &rb->rf_records.rl_recs[i];
> +
> +		if (le64_to_cpu(rec->r_cpos) +
> +		    le32_to_cpu(rec->r_clusters) <= cpos)
> +			continue;
> +		else if (le64_to_cpu(rec->r_cpos) > cpos)
> +			break;
> +
> +		/* ok, cpos fail in this rec. Just return. */
> +		if (ret_rec)
> +			*ret_rec = *rec;
> +		goto out;
> +	}
> +
> +	if (ret_rec) {
> +		/* We meet with a hole here, so fake the rec. */
> +		ret_rec->r_cpos = cpu_to_le64(cpos);
> +		ret_rec->r_refcount = 0;
> +		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
> +		    le64_to_cpu(rec->r_cpos) < cpos + len)
> +			ret_rec->r_clusters > +			
cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
> +		else
> +			ret_rec->r_clusters = cpu_to_le32(len);
> +	}
> +
> +out:
> +	*index = i;
> +	return;
> +}
	No need for 'return' at the end of a void function.
> +static int ocfs2_expand_inline_ref_root(handle_t *handle,
> +					struct ocfs2_caching_info *ci,
> +					struct buffer_head *ref_root_bh,
> +					struct buffer_head **ref_leaf_bh,
> +					struct ocfs2_alloc_context *meta_ac)
> +{
> +	int ret;
> +	u16 suballoc_bit_start;
> +	u32 num_got;
> +	u64 blkno;
> +	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
> +	struct buffer_head *new_bh = NULL;
> +	struct ocfs2_refcount_block *new_rb;
> +	struct ocfs2_refcount_block *root_rb > +			(struct
ocfs2_refcount_block *)ref_root_bh->b_data;
> +
> +	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
> +				      OCFS2_JOURNAL_ACCESS_WRITE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
> +				   &suballoc_bit_start, &num_got,
> +				   &blkno);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	new_bh = sb_getblk(sb, blkno);
> +	if (new_bh == NULL) {
> +		ret = -EIO;
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +	ocfs2_set_new_buffer_uptodate(ci, new_bh);
> +
> +	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
> +				      OCFS2_JOURNAL_ACCESS_CREATE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	/*
> +	 * Initialize ocfs2_refcount_block.
> +	 * It should contain the same information as the old root.
> +	 * so just memcpy it and change the corresponding field.
> +	 */
> +	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
> +
> +	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
> +	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
> +	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
> +	new_rb->rf_blkno = cpu_to_le64(blkno);
> +	new_rb->rf_cpos = cpu_to_le32(0);
> +	ocfs2_journal_dirty(handle, new_bh);
	The new leaf needs to set the LEAF flag and set rf_parent to the
root.
> +static void swap_refcount_rec(void *a, void *b, int size)
> +{
> +	struct ocfs2_refcount_rec *l = a, *r = b, tmp;
> +
> +	tmp = *l;
> +	memcpy(l, r, sizeof(struct ocfs2_refcount_rec));
> +	memcpy(r, &tmp, sizeof(struct ocfs2_refcount_rec));
	You don't need the memcpy's here.  You can just do:

	tmp = *l;
	*l = *r;
	*r = tmp;
> +static int ocfs2_new_leaf_refcount_block(handle_t *handle,
> +					 struct ocfs2_caching_info *ci,
> +					 struct buffer_head *ref_root_bh,
> +					 struct buffer_head *ref_leaf_bh,
> +					 struct ocfs2_alloc_context *meta_ac)
> +{
> +	int ret;
> +	u16 suballoc_bit_start;
> +	u32 num_got;
> +	u64 blkno;
> +	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
> +	struct ocfs2_refcount_block *root_rb > +			(struct
ocfs2_refcount_block *)ref_root_bh->b_data;
> +	struct buffer_head *new_bh = NULL;
> +	struct ocfs2_refcount_block *new_rb;
> +
> +	BUG_ON(!ref_root_bh);
> +	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) &
OCFS2_REFCOUNT_TREE_FL));
> +
> +	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
> +				      OCFS2_JOURNAL_ACCESS_WRITE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
> +				      OCFS2_JOURNAL_ACCESS_WRITE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
> +				   &suballoc_bit_start, &num_got,
> +				   &blkno);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	new_bh = sb_getblk(sb, blkno);
> +	if (new_bh == NULL) {
> +		ret = -EIO;
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +	ocfs2_set_new_buffer_uptodate(ci, new_bh);
> +
> +	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
> +				      OCFS2_JOURNAL_ACCESS_CREATE);
> +	if (ret) {
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
> +	/* Initialize ocfs2_refcount_block. */
> +	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
> +	memset(new_rb, 0, sb->s_blocksize);
> +	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
> +	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
> +	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
> +	new_rb->rf_fs_generation =
cpu_to_le32(OCFS2_SB(sb)->fs_generation);
> +	new_rb->rf_blkno = cpu_to_le64(blkno);
> +	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
> +	new_rb->rf_records.rl_count > +			
cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	Need to set rf_parent to the root of the btree.
> +static int ocfs2_split_refcount_rec(handle_t *handle,
> +				    struct ocfs2_caching_info *ci,
> +				    struct buffer_head *ref_root_bh,
> +				    struct buffer_head *ref_leaf_bh,
> +				    struct ocfs2_refcount_rec *split_rec,
> +				    int index,
> +				    struct ocfs2_alloc_context *meta_ac,
> +				    struct ocfs2_cached_dealloc_ctxt *dealloc)
> +{
> +	int ret, recs_need;
> +	u32 len;
> +	struct ocfs2_refcount_block *rb > +			(struct ocfs2_refcount_block
*)ref_leaf_bh->b_data;
> +	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
> +	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
> +	struct ocfs2_refcount_rec *tail_rec = NULL;
> +	struct buffer_head *new_bh = NULL;
> +
> +	BUG_ON(!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_LEAF_FL));
	This should be:

	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

Also, split_refcount_rec could probably use a couple comments at each
stage.  Short hings like "ok, remove the entries we just moved over to
the other block".

Joel

-- 

Life's Little Instruction Book #337

	"Reread your favorite book."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

Joel Becker

2009-May-08 01:30 UTC

head link

[Ocfs2-devel] [PATCH 23/39] ocfs2: lock refcount tree if needed.

On Thu, Apr 30, 2009 at 06:58:35AM +0800, Tao Ma wrote:> Signed-off-by: Tao Ma <tao.ma at oracle.com>
Could use a better description :-)

Joel

-- 

"We will have to repent in this generation not merely for the
 vitriolic words and actions of the bad people, but for the 
 appalling silence of the good people."
	- Rev. Dr. Martin Luther King, Jr.

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

Apparently Analagous Threads

Search for more maybe matching threads

Ocfs2 devel - Apr 2009 - [PATCH 00/39] ocfs2: Add reflink file support. V3

[Ocfs2-devel] [PATCH 01/39] ocfs2: Define refcount tree structure.

[Ocfs2-devel] [PATCH 02/39] ocfs2: Add metaecc for ocfs2_refcount_block.

[Ocfs2-devel] [PATCH 03/39] ocfs2: Add ocfs2_read_refcount_block.

[Ocfs2-devel] [PATCH 04/39] ocfs2: Basic tree root operation.

[Ocfs2-devel] [PATCH 05/39] ocfs2: hook remove refcount tree into truncate.

[Ocfs2-devel] [PATCH 06/39] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree.

[Ocfs2-devel] [PATCH 07/39] ocfs2: Abstract extent split process.

[Ocfs2-devel] [PATCH 08/39] ocfs2: Add refcount b-tree as a new extent tree.

[Ocfs2-devel] [PATCH 09/39] ocfs2: export tree operation functions.

[Ocfs2-devel] [PATCH 10/39] ocfs2: Add support for incrementing refcount in the tree.

[Ocfs2-devel] [PATCH 11/39] ocfs2: Add support of decrementing refcount for delete.

[Ocfs2-devel] [PATCH 12/39] ocfs2: Add functions for extents refcounted.

[Ocfs2-devel] [PATCH 13/39] ocfs2: Hook 'Decrement refcount for delete'.

[Ocfs2-devel] [PATCH 14/39] ocfs2: Add CoW support.

[Ocfs2-devel] [PATCH 15/39] ocfs2: CoW refcount tree improvement.

[Ocfs2-devel] [PATCH 16/39] ocfs2: Add __ocfs2_reflink.

[Ocfs2-devel] [PATCH 17/39] ocfs2: Add ioctl for reflink.

[Ocfs2-devel] [PATCH 18/39] ocfs2: Use proper parameter for some inode operation.

[Ocfs2-devel] [PATCH 19/39] ocfs2: Create reflinked file in orphan dir.

[Ocfs2-devel] [PATCH 20/39] ocfs2: Abstract caching info checkpoint.

[Ocfs2-devel] [PATCH 21/39] ocfs2: Add new refcount tree lock resource.

[Ocfs2-devel] [PATCH 22/39] ocfs2: Add refcount tree lock mechanism.

[Ocfs2-devel] [PATCH 23/39] ocfs2: lock refcount tree if needed.

[Ocfs2-devel] [PATCH 24/39] ocfs2: Add caching info for refcount tree.

[Ocfs2-devel] [PATCH 25/39] ocfs2: Add refcount tree find mechanism from an inode.

[Ocfs2-devel] [PATCH 26/39] ocfs2: Return extent flags for xattr value tree.

[Ocfs2-devel] [PATCH 27/39] ocfs2: Abstract duplicate clusters process in CoW.

[Ocfs2-devel] [PATCH 28/39] ocfs2: Add CoW support for xattr.

[Ocfs2-devel] [PATCH 29/39] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value.

[Ocfs2-devel] [PATCH 30/39] ocfs2: Abstract the creation of xattr block.

[Ocfs2-devel] [PATCH 31/39] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process.

[Ocfs2-devel] [PATCH 32/39] ocfs2: Attach xattr clusters to refcount tree.

[Ocfs2-devel] [PATCH 33/39] ocfs2: Call refcount tree remove process properly.

[Ocfs2-devel] [PATCH 34/39] ocfs2: Create an xattr indexed block if needed.

[Ocfs2-devel] [PATCH 35/39] ocfs2: Add reflink support for xattr.

[Ocfs2-devel] [PATCH 36/39] ocfs2: Modify removing xattr process for refcount.

[Ocfs2-devel] [PATCH 37/39] ocfs2: Don't merge in 1st refcount ops of reflink.

[Ocfs2-devel] [PATCH 38/39] ocfs2: Make transaction extend more efficient.

[Ocfs2-devel] [PATCH 39/39] ocfs2: Enable refcount tree support.

[Ocfs2-devel] [PATCH 00/39] ocfs2: Add reflink file support. V3

[Ocfs2-devel] [PATCH 17/39] ocfs2: Add ioctl for reflink.

[Ocfs2-devel] [PATCH 10/39] ocfs2: Add support for incrementing refcount in the tree.

[Ocfs2-devel] [PATCH 23/39] ocfs2: lock refcount tree if needed.

Apparently Analagous Threads