thr3ads.net - Ocfs2 devel - [Ocfs2-devel] [PATCH 0/8] ocfs2: Add extended attributes for ocfs2. V1 [Jun 2008]

If this information is useful, please help other people find it:
Share via:

Tao Ma

2008-Jun-05 07:16 UTC

[Ocfs2-devel] [PATCH 0/8] ocfs2: Add extended attributes for ocfs2. V1

Hi all,
	Extended attributes are used for storing POSIX ACLs, SELinux labels, 
and user accessible metadata. They are essential for deploying file 
systems exported for workgroup use via samba. The following patches 
implement extended attributes on the OCFS2 file system.

Patch 1-4: The refactoring of extent tree operation. xattr will use 
ocfs2_extent_list for both the large EA and large numbers of EA. So we 
need some refactoring so that the old code can work with them.

Patch 5: Add ocfs2 xattr header.

Patch 6: Add operations for storing large EAs in ocfs2. This also 
refactors the old extent tree a little.

Patch 7: The extended attributes for ocfs2. It enables extended 
attributes storage for both in-inode and one meta block. The main design 
doc can be found at 
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ExtendedAttributes.

Patch 8: Enables storing large numbers of EAs in ocfs2. The main design 
doc can be found at 
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/IndexedEATrees. The main 
process is almost the same; the differences are described in detail 
in that patch.

Regards,
Tiger and Tao

Tiger Yang

2008-Jun-05 07:24 UTC

head link

[Ocfs2-devel] [PATCH 7/8] ocfs2: Add extended attributes support. v1

This patch implements storing extended attributes both in the inode and in a
metadata block.
For EAs in the inode, we reserve the last 256 bytes of the inode block
(blocksize >= 1024).
When an EA's value size is larger than 80 bytes, we store the value via a
b-tree outside the inode or block.

Signed-off-by: Tiger Yang <tiger.yang at oracle.com>
---
 fs/ocfs2/Makefile        |    2 +
 fs/ocfs2/file.c          |    5 +
 fs/ocfs2/inode.c         |    8 +
 fs/ocfs2/namei.c         |    5 +
 fs/ocfs2/ocfs2.h         |    1 +
 fs/ocfs2/ocfs2_fs.h      |   43 ++-
 fs/ocfs2/super.c         |   12 +
 fs/ocfs2/xattr.c         | 1342 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/xattr.h         |   27 +
 fs/ocfs2/xattr_trusted.c |   79 +++
 fs/ocfs2/xattr_user.c    |   91 ++++
 11 files changed, 1600 insertions(+), 15 deletions(-)
 create mode 100644 fs/ocfs2/xattr_trusted.c
 create mode 100644 fs/ocfs2/xattr_user.c

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index af63980..21323da 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -36,6 +36,8 @@ ocfs2-objs := \
 	uptodate.o		\
 	ver.o			\
 	xattr.o			\
+	xattr_user.o		\
+	xattr_trusted.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e138fec..b2b96b3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -2070,6 +2071,10 @@ const struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 	.fallocate	= ocfs2_fallocate,
 };
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c7..da4b013 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -730,6 +731,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/*Free extended attribute resources associated with this inode.*/
+	status = ocfs2_xattr_remove(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808f..b938b00 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -1918,4 +1919,8 @@ const struct inode_operations ocfs2_dir_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3169237..f567215 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -172,6 +172,7 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c4266..d0fbf38 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -291,6 +291,9 @@ struct ocfs2_new_group_input {
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE		(4 * 1024 * 1024)
 
+/* Inline extended attribute size (in bytes) */
+#define OCFS2_MAX_XATTR_INLINE_SIZE	256
+
 /*
  * Default local alloc size (in megabytes)
  *
@@ -640,11 +643,12 @@ struct ocfs2_dinode {
 	__le32 i_atime_nsec;
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
-	__le32 i_attr;
+/*70*/	__le32 i_attr;
 	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
 					   was set in i_flags */
 	__le16 i_dyn_features;
-/*70*/	__le64 i_reserved2[8];
+	__le64 i_xattr_loc;
+/*80*/	__le64 i_reserved2[7];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -721,16 +725,26 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 
 static inline int ocfs2_max_inline_data(struct super_block *sb)
 {
-	return sb->s_blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+	if (sb->s_blocksize != OCFS2_MIN_BLOCKSIZE)
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			OCFS2_MAX_XATTR_INLINE_SIZE;
+	else
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
 	int size;
 
-	size = sb->s_blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+	if (sb->s_blocksize != OCFS2_MIN_BLOCKSIZE)
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			OCFS2_MAX_XATTR_INLINE_SIZE;
+	else
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
 
 	return size / sizeof(struct ocfs2_extent_rec);
 }
@@ -806,15 +820,26 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
 
 static inline int ocfs2_max_inline_data(int blocksize)
 {
-	return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+	if (blocksize != OCFS2_MIN_BLOCKSIZE)
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			OCFS2_MAX_XATTR_INLINE_SIZE;
+	else
+		return blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
 static inline int ocfs2_extent_recs_per_inode(int blocksize)
 {
 	int size;
 
-	size = blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+	if (blocksize != OCFS2_MIN_BLOCKSIZE)
+		size = blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			OCFS2_MAX_XATTR_INLINE_SIZE;
+	else
+		size = blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
 
 	return size / sizeof(struct ocfs2_extent_rec);
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba2..2628dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -154,6 +155,8 @@ enum {
 	Opt_localalloc,
 	Opt_localflocks,
 	Opt_stack,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
 	Opt_err,
 };
 
@@ -173,6 +176,8 @@ static match_table_t tokens = {
 	{Opt_localalloc, "localalloc=%d"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_stack, "cluster_stack=%s"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_err, NULL}
 };
 
@@ -847,6 +852,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_data_writeback:
 			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
+		case Opt_user_xattr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_nouser_xattr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+			break;
 		case Opt_atime_quantum:
 			if (match_int(&args[0], &option)) {
 				status = 0;
@@ -1375,6 +1386,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c223ab0..ed07448 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -21,6 +21,19 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
@@ -28,6 +41,7 @@
 #include "alloc.h"
 #include "dlmglue.h"
 #include "file.h"
+#include "sysfile.h"
 #include "inode.h"
 #include "journal.h"
 #include "ocfs2_fs.h"
@@ -36,6 +50,129 @@
 #include "buffer_head_io.h"
 #include "xattr.h"
 
+
+#define OCFS2_XATTR_PAD_BITS	2
+#define OCFS2_XATTR_PAD		4
+#define OCFS2_XATTR_ROUND	(OCFS2_XATTR_PAD-1)
+#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
+				~OCFS2_XATTR_ROUND)
+#define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_NAME_HASH_SHIFT	5
+#define OCFS2_VALUE_HASH_SHIFT	16
+
+/*
+ * Template value root with an extent list sized for one record
+ * (l_count = 1).  ocfs2_xattr_set_entry() substitutes it for the real
+ * value when the value is larger than OCFS2_XATTR_INLINE_SIZE.
+ */
+static struct ocfs2_xattr_def_value_root def_xv = {
+	.xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+/*
+ * NULL-terminated list of the xattr handlers this filesystem exposes.
+ * The ACL, Lustre and security handlers are only listed when the matching
+ * CONFIG_OCFS2_FS_* option is defined.
+ */
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+	&ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	&ocfs2_xattr_acl_access_handler,
+	&ocfs2_xattr_acl_default_handler,
+#endif
+	&ocfs2_xattr_trusted_handler,
+#ifdef CONFIG_OCFS2_FS_LUSTRE
+	&ocfs2_xattr_lustre_handler,
+#endif
+#ifdef CONFIG_OCFS2_FS_SECURITY
+	&ocfs2_xattr_security_handler,
+#endif
+	NULL
+};
+
+/*
+ * Handler lookup table indexed by OCFS2_XATTR_INDEX_* (the value stored in
+ * an entry's xe_type).  Slots without an initializer are NULL.  Consulted
+ * through ocfs2_xattr_handler().
+ */
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+					= &ocfs2_xattr_acl_access_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+					= &ocfs2_xattr_acl_default_handler,
+#endif
+	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+#ifdef CONFIG_OCFS2_FS_LUSTRE
+	[OCFS2_XATTR_INDEX_LUSTRE]	= &ocfs2_xattr_lustre_handler,
+#endif
+#ifdef CONFIG_OCFS2_FS_SECURITY
+	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
+#endif
+};
+
+/* Describes the attribute a set operation should create, replace or remove. */
+struct ocfs2_xattr_info {
+	int name_index;		/* handler index, stored as xe_type */
+	const char *name;	/* NUL-terminated name (strlen() is applied) */
+	const void *value;	/* new value; NULL removes the attribute */
+	size_t value_len;	/* length of @value in bytes */
+};
+
+/* Cursor describing one xattr storage area and the entry located in it. */
+struct ocfs2_xattr_search {
+	struct buffer_head *inode_bh;	/* buffer holding the ocfs2_dinode */
+	struct buffer_head *xattr_bh;	/* buffer journaled when entries change */
+	struct ocfs2_xattr_header *header;	/* entry table of the area */
+	void *base;	/* address that xe_name_offset is relative to */
+	void *end;	/* one past the last byte of the area */
+	struct ocfs2_xattr_entry *here;	/* entry found, or where the scan stopped */
+	int not_found;	/* non-zero when the name was not found */
+};
+
+/* Number of filesystem blocks that make up one cluster on @sb. */
+static inline u32 ocfs2_blocks_per_cluster(struct super_block *sb)
+{
+	return 1 << (OCFS2_SB(sb)->s_clustersize_bits -
+		     sb->s_blocksize_bits);
+}
+
+/*
+ * Translate a name index into its xattr handler.
+ *
+ * Returns NULL when @name_index is not strictly between 0 and
+ * OCFS2_XATTR_MAX; otherwise returns the ocfs2_xattr_handler_map slot.
+ */
+static inline struct xattr_handler *ocfs2_xattr_handler(int name_index)
+{
+	if (name_index <= 0 || name_index >= OCFS2_XATTR_MAX)
+		return NULL;
+
+	return ocfs2_xattr_handler_map[name_index];
+}
+
+/*
+ * Hash an attribute name: the first @prefix_len bytes of @prefix followed
+ * by the first @name_len bytes of @name.  For every byte the running hash
+ * is rotated left by OCFS2_NAME_HASH_SHIFT bits and XORed with the byte.
+ */
+static inline __u32 ocfs2_xattr_name_hash(char *prefix,
+					  int prefix_len,
+					  char *name,
+					  int name_len)
+{
+	__u32 h = 0;
+	int n;
+
+	for (n = 0; n < prefix_len; n++)
+		h = (h << OCFS2_NAME_HASH_SHIFT) ^
+		    (h >> (8*sizeof(h) - OCFS2_NAME_HASH_SHIFT)) ^
+		    prefix[n];
+
+	for (n = 0; n < name_len; n++)
+		h = (h << OCFS2_NAME_HASH_SHIFT) ^
+		    (h >> (8*sizeof(h) - OCFS2_NAME_HASH_SHIFT)) ^
+		    name[n];
+
+	return h;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the name hash of an extended attribute entry and store it,
+ * little-endian, in entry->xe_name_hash.  The hash covers the handler
+ * prefix for entry->xe_type followed by the entry's name bytes, which
+ * live at @header + xe_name_offset.
+ *
+ * If xe_type has no registered handler, the hash is computed over the
+ * name alone instead of dereferencing a NULL handler.
+ */
+static void ocfs2_xattr_hash_entry(struct ocfs2_xattr_header *header,
+				   struct ocfs2_xattr_entry *entry)
+{
+	struct xattr_handler *handler = ocfs2_xattr_handler(entry->xe_type);
+	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+	char *prefix = NULL;
+	int prefix_len = 0;
+	__u32 hash;
+
+	if (handler) {
+		prefix = handler->prefix;
+		prefix_len = strlen(handler->prefix);
+	}
+
+	hash = ocfs2_xattr_name_hash(prefix, prefix_len, name,
+				     entry->xe_name_len);
+	entry->xe_name_hash = cpu_to_le32(hash);
+}
+
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
 					 struct buffer_head *xattr_bh,
@@ -50,7 +187,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_list *root_el = &xv->xr_list;
-	u32 logical_start = le16_to_cpu(xv->xr_clusters);
+	u32 logical_start = le32_to_cpu(xv->xr_clusters);
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
@@ -155,7 +292,9 @@ leave:
 static int __ocfs2_remove_xattr_range(struct inode *inode,
 				      struct buffer_head *root_bh,
 				      struct ocfs2_xattr_value_root *xv,
-				      u32 cpos, u32 phys_cpos, u32 len,
+				      u32 cpos,
+				      u32 phys_cpos,
+				      u32 len,
 				      struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret;
@@ -184,9 +323,8 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		goto out;
 	}
 
@@ -194,7 +332,7 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_commit;
 	}
 
 	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
@@ -299,3 +437,1195 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 
 	return ret;
 }
+
+/*
+ * Walk every entry under @header and let the handler registered for its
+ * type emit the name through its ->list() callback.  Entries whose type
+ * has no handler are skipped.
+ *
+ * When @buffer is non-NULL it is advanced past each emitted name and
+ * -ERANGE is returned as soon as a handler reports more bytes than remain.
+ * Otherwise the return value is the sum of the sizes the handlers reported.
+ */
+static int ocfs2_xattr_list_entries(struct inode *inode,
+				    struct ocfs2_xattr_header *header,
+				    char *buffer, size_t buffer_size)
+{
+	struct ocfs2_xattr_entry *xe = header->xh_entries;
+	size_t remaining = buffer_size;
+	int idx;
+
+	for (idx = 0; idx < le16_to_cpu(header->xh_count); idx++, xe++) {
+		struct xattr_handler *h = ocfs2_xattr_handler(xe->xe_type);
+		size_t used;
+
+		if (!h)
+			continue;
+
+		used = h->list(inode, buffer, remaining,
+			       ((char *)header +
+			       le16_to_cpu(xe->xe_name_offset)),
+			       xe->xe_name_len);
+		if (buffer) {
+			if (used > remaining)
+				return -ERANGE;
+			buffer += used;
+		}
+		remaining -= used;
+	}
+
+	return buffer_size - remaining;
+}
+
+/*
+ * List the attributes stored inside the inode block.
+ *
+ * Returns 0 when the dinode does not carry OCFS2_INLINE_XATTR_FL.
+ * Otherwise the header is taken from the last OCFS2_MAX_XATTR_INLINE_SIZE
+ * bytes of the inode block and the result of ocfs2_xattr_list_entries()
+ * is returned.
+ */
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct ocfs2_xattr_header *inline_hdr;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL))
+		return 0;
+
+	inline_hdr = (struct ocfs2_xattr_header *)
+		     ((void *)di + inode->i_sb->s_blocksize -
+		     OCFS2_MAX_XATTR_INLINE_SIZE);
+
+	return ocfs2_xattr_list_entries(inode, inline_hdr, buffer,
+					buffer_size);
+}
+
+/*
+ * List the attributes stored in the external xattr block.
+ *
+ * Returns 0 when di->i_xattr_loc is zero, the error from
+ * ocfs2_read_block() when the block cannot be read, and otherwise the
+ * result of ocfs2_xattr_list_entries() on the block's header.  The block
+ * buffer is released before returning.
+ */
+static int ocfs2_xattr_block_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int status;
+
+	if (!di->i_xattr_loc)
+		return 0;
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  le64_to_cpu(di->i_xattr_loc),
+				  &blk_bh, OCFS2_BH_CACHED, inode);
+	if (status)
+		return status;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	status = ocfs2_xattr_list_entries(inode, &xb->xb_attrs.xb_header,
+					  buffer, buffer_size);
+
+	if (blk_bh)
+		brelse(blk_bh);
+	return status;
+}
+
+/*
+ * ocfs2_listxattr()
+ *
+ * List the extended attribute names of @dentry's inode into @buffer.
+ * In-inode attributes are listed first, followed by those held in the
+ * external xattr block.
+ *
+ * Returns the error from ocfs2_inode_lock() if the lock cannot be taken,
+ * 0 when the inode does not carry OCFS2_HAS_XATTR_FL, a negative value if
+ * either listing step fails, and otherwise the sum of both steps' results.
+ *
+ * The inode lock and the dinode buffer are released on every path after
+ * the lock has been taken, including the "no xattrs" early exit.
+ */
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+			char *buffer,
+			size_t size)
+{
+	int ret, i_ret = 0, b_ret = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+
+	ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	/* Nothing to list: fall through to the unlock path with 0 + 0. */
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_XATTR_FL))
+		goto out_unlock;
+
+	i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+	if (i_ret < 0)
+		b_ret = 0;
+	else {
+		if (buffer) {
+			buffer += i_ret;
+			size -= i_ret;
+		}
+		b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+					       buffer, size);
+		/* Report only the error, not a partial byte count. */
+		if (b_ret < 0)
+			i_ret = 0;
+	}
+
+out_unlock:
+	ocfs2_inode_unlock(dentry->d_inode, 0);
+
+	if (di_bh)
+		brelse(di_bh);
+
+	return (i_ret + b_ret);
+}
+
+/*
+ * Scan the entries of xs->header, starting at xs->here, for one whose
+ * type, name length and name bytes all match @name_index and @name.
+ *
+ * On return xs->here points at the matching entry, or just past the last
+ * entry examined when nothing matched.  Returns 0 on a match, -ENODATA
+ * when there is none and -EINVAL when @name is NULL.
+ */
+static int ocfs2_xattr_find_entry(int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *xe;
+	size_t len;
+	int n, diff = 1;
+
+	if (!name)
+		return -EINVAL;
+
+	len = strlen(name);
+	xe = xs->here;
+	for (n = 0; n < le16_to_cpu(xs->header->xh_count); n++, xe++) {
+		diff = name_index - xe->xe_type;
+		if (diff == 0)
+			diff = len - xe->xe_name_len;
+		if (diff == 0)
+			diff = memcmp(name, (xs->base +
+				      le16_to_cpu(xe->xe_name_offset)),
+				      len);
+		if (diff == 0)
+			break;
+	}
+	xs->here = xe;
+
+	if (diff)
+		return -ENODATA;
+	return 0;
+}
+
+/*
+ * Copy @len bytes of an out-of-line attribute value into @buffer.
+ *
+ * The value root is located right after the padded name of xs->here.  Its
+ * extent list is walked cluster range by cluster range and the value is
+ * copied block by block.  The walk stops once @len bytes have been copied,
+ * so no block beyond the end of the value is read.
+ *
+ * Returns 0 on success or the error from ocfs2_xattr_get_clusters() /
+ * ocfs2_read_block().
+ */
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_search *xs,
+					 void *buffer,
+					 size_t len)
+{
+	u32 cpos, p_cluster, num_clusters, bpc, clusters;
+	u32 i;
+	u64 blkno;
+	int ret = 0;
+	size_t cplen, blocksize;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_extent_list *el;
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(xs->base + le16_to_cpu(xs->here->xe_name_offset) +
+		OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+	el = &xv->xr_list;
+	clusters = le32_to_cpu(xv->xr_clusters);
+	bpc = ocfs2_blocks_per_cluster(inode->i_sb);
+	blocksize = inode->i_sb->s_blocksize;
+
+	cpos = 0;
+	while (cpos < clusters && len > 0) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		/* copy ocfs2_xattr_value */
+		for (i = 0; i < num_clusters * bpc && len > 0; i++, blkno++) {
+			ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+					       &bh, OCFS2_BH_CACHED, inode);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/* The final block may be only partially used. */
+			cplen = len >= blocksize ? blocksize : len;
+			memcpy(buffer, bh->b_data, cplen);
+			len -= cplen;
+			buffer += cplen;
+
+			brelse(bh);
+			bh = NULL;
+		}
+		cpos += num_clusters;
+	}
+out:
+	return ret;
+}
+
+/*
+ * Look up an attribute in the in-inode xattr area.
+ *
+ * Points @xs at the last OCFS2_MAX_XATTR_INLINE_SIZE bytes of the inode
+ * block and searches it.  Returns -ENODATA when the dinode lacks
+ * OCFS2_INLINE_XATTR_FL or the name is absent, and -ERANGE when @buffer is
+ * given but smaller than the value.  On success it returns the value size,
+ * having copied the value into @buffer when @buffer is non-NULL.
+ */
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	size_t size;
+	int status;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL))
+		return -ENODATA;
+
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - OCFS2_MAX_XATTR_INLINE_SIZE);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	status = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (status)
+		return status;
+
+	size = le64_to_cpu(xs->here->xe_value_size);
+	/* A NULL buffer is a size query only. */
+	if (!buffer)
+		return size;
+	if (size > buffer_size)
+		return -ERANGE;
+
+	if (!xs->here->xe_local) {
+		status = ocfs2_xattr_get_value_outside(inode, xs,
+						       buffer, size);
+		if (status)
+			return status;
+		return size;
+	}
+
+	/* Local values sit directly behind the padded name. */
+	memcpy(buffer, (void *)xs->base +
+	       le16_to_cpu(xs->here->xe_name_offset) +
+	       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+	return size;
+}
+
+/*
+ * Look up an attribute in the external xattr block.
+ *
+ * Returns -ENODATA when di->i_xattr_loc is zero or the name is absent,
+ * -ERANGE when @buffer is given but smaller than the value, or the error
+ * from reading the block or the out-of-line value.  On success it returns
+ * the value size, having copied the value when @buffer is non-NULL.
+ *
+ * NOTE(review): the block buffer is released before returning while
+ * xs->xattr_bh, header, base, end and here still point into it.  Callers
+ * presumably must not use those fields afterwards -- confirm.
+ */
+static int ocfs2_xattr_block_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	size_t size;
+	int status;
+
+	if (!di->i_xattr_loc)
+		return -ENODATA;
+
+	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				  le64_to_cpu(di->i_xattr_loc),
+				  &blk_bh, OCFS2_BH_CACHED, inode);
+	if (status)
+		goto release;
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	xs->header = &xb->xb_attrs.xb_header;
+	xs->base = (void *)xs->header;
+	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+	xs->here = xs->header->xh_entries;
+
+	status = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (status)
+		goto release;
+
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		if (size > buffer_size) {
+			status = -ERANGE;
+			goto release;
+		}
+		if (xs->here->xe_local) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			status = ocfs2_xattr_get_value_outside(inode, xs,
+							       buffer, size);
+			if (status)
+				goto release;
+		}
+	}
+	status = size;
+release:
+	if (blk_bh)
+		brelse(blk_bh);
+	return status;
+}
+
+/*
+ * ocfs2_xattr_get()
+ *
+ * Fetch the value of attribute @name (in namespace @name_index) of @inode.
+ * The in-inode area is searched first; the external block is consulted
+ * only when that search returns -ENODATA.  The inode lock is taken for the
+ * duration of the lookup.
+ *
+ * Returns the result of the lookup that was performed, -ENODATA when the
+ * inode does not carry OCFS2_HAS_XATTR_FL, or the locking error.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    void *buffer,
+		    size_t buffer_size)
+{
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int status;
+
+	status = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	xis.inode_bh = di_bh;
+	xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_XATTR_FL) {
+		status = ocfs2_xattr_ibody_get(inode, name_index, name,
+					       buffer, buffer_size, &xis);
+		if (status == -ENODATA)
+			status = ocfs2_xattr_block_get(inode, name_index,
+						       name, buffer,
+						       buffer_size, &xbs);
+	} else {
+		status = -ENODATA;
+	}
+
+	ocfs2_inode_unlock(inode, 0);
+
+	if (di_bh)
+		brelse(di_bh);
+	return status;
+}
+
+/*
+ * Write @value_len bytes of @value into the clusters already recorded in
+ * @xv's extent list, one block at a time, inside a single transaction.
+ *
+ * The caller must have allocated enough clusters beforehand; needing more
+ * clusters than xv->xr_clusters holds is a BUG.  The unused tail of the
+ * last written block is zero-filled.
+ *
+ * Returns 0 on success or a negative error from starting the transaction,
+ * mapping clusters, reading a block or getting journal access.
+ */
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   struct ocfs2_xattr_value_root *xv,
+					   const void *value,
+					   int value_len)
+{
+	int ret = 0, i, cp_len, credits;
+	u16 blocksize = inode->i_sb->s_blocksize;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	handle_t *handle;
+
+	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+	/* One credit for every block in the clusters the value occupies. */
+	credits = clusters * bpc;
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	while (cpos < clusters) {
+		/* Map the next run of logical clusters to physical ones. */
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+					       &bh, OCFS2_BH_CACHED, inode);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_journal_access(handle,
+						   inode,
+						   bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret < 0) {
+				/* bh is still held here; released at "out". */
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			/* Copy at most one block, then zero the remainder. */
+			cp_len = value_len > blocksize ? blocksize : value_len;
+			memcpy(bh->b_data, value, cp_len);
+			value_len -= cp_len;
+			value += cp_len;
+			if (cp_len < blocksize)
+				memset(bh->b_data + cp_len, 0,
+				       blocksize - cp_len);
+
+			ocfs2_journal_dirty(handle, bh);
+			brelse(bh);
+			bh = NULL;
+
+			/*
+			 * XXX: do we need to empty all the following
+			 * blocks in this cluster?
+			 */
+			if (!value_len)
+				break;
+		}
+
+		cpos += num_clusters;
+	}
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	if (bh)
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Record a new name offset and value size in xs->here under its own
+ * transaction, set xe_local according to whether the size is within
+ * OCFS2_XATTR_INLINE_SIZE, and recompute the entry's name hash.
+ *
+ * Returns 0 on success or the error from starting the transaction,
+ * getting journal access to xs->xattr_bh, or dirtying it.
+ */
+static inline int ocfs2_xattr_update_entry(struct inode *inode,
+					   struct ocfs2_xattr_info *xi,
+					   struct ocfs2_xattr_search *xs,
+					   size_t offs)
+{
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	status = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+		goto commit;
+	}
+
+	xs->here->xe_name_offset = cpu_to_le16(offs);
+	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+	xs->here->xe_local =
+		(le64_to_cpu(xs->here->xe_value_size) <=
+		 OCFS2_XATTR_INLINE_SIZE) ? 1 : 0;
+	ocfs2_xattr_hash_entry(xs->header, xs->here);
+
+	status = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (status < 0)
+		mlog_errno(status);
+commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+	return status;
+}
+
+/*
+ * Store a large value out of line for the entry whose name/value slot
+ * starts at xs->base + @offs.
+ *
+ * The slot is rewritten as the padded name followed by an empty value
+ * root.  The root is then resized for xi->value_len, the value is written
+ * into its clusters, and the entry is updated.  Returns the first negative
+ * error hit, or the result of ocfs2_xattr_update_entry().
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 size_t offs)
+{
+	size_t name_len = strlen(xi->name);
+	size_t padded_name = OCFS2_XATTR_SIZE(name_len);
+	void *slot = xs->base + offs;
+	struct ocfs2_xattr_value_root *xv;
+	int status;
+
+	/* Wipe the slot, then lay down the name and an empty root. */
+	memset(slot, 0, padded_name + OCFS2_XATTR_ROOT_SIZE);
+	memcpy(slot, xi->name, name_len);
+
+	xv = (struct ocfs2_xattr_value_root *)(slot + padded_name);
+	xv->xr_clusters = 0;
+	xv->xr_last_eb_blk = 0;
+	xv->xr_list.l_tree_depth = 0;
+	xv->xr_list.l_count = cpu_to_le16(1);
+	xv->xr_list.l_next_free_rec = 0;
+
+	status = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+					    xi->value_len);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	status = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+						 xi->value_len);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	return ocfs2_xattr_update_entry(inode, xi, xs, offs);
+}
+
+
+
+/*
+ * Apply an insert, replace or removal of one attribute to the entry table
+ * and name/value area described by @xs.  Only memory is modified here; no
+ * journal calls are made in this function.
+ *
+ * @xi:       attribute to store; xi->value == NULL removes it.
+ * @xs:       search state; xs->here is the entry to operate on and
+ *            xs->not_found is non-zero when the name does not exist yet.
+ * @last:     entry slot filled in when a new entry is inserted.
+ * @min_offs: lowest name offset currently in use, relative to xs->base.
+ *
+ * New name+value pairs are placed immediately below @min_offs.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_entry *last,
+					size_t min_offs)
+{
+	size_t name_len = strlen(xi->name);
+	int i;
+
+	if (xi->value && xs->not_found) {
+		/* Insert the new xattr entry. */
+		le16_add_cpu(&xs->header->xh_count, 1);
+		last->xe_type = xi->name_index;
+		if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+			last->xe_local = 1;
+		else
+			last->xe_local = 0;
+		last->xe_name_len = name_len;
+	} else {
+		void *first_val;
+		void *val;
+		size_t offs, size;
+
+		first_val = xs->base + min_offs;
+		offs = le16_to_cpu(xs->here->xe_name_offset);
+		val = xs->base + offs;
+
+		/*
+		 * Space taken by the existing pair: the padded name plus
+		 * either a value root or the padded local value.
+		 */
+		if (le64_to_cpu(xs->here->xe_value_size) >
+		    OCFS2_XATTR_INLINE_SIZE)
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len)) {
+			/* The old and the new value have the
+			   same size. Just replace the value. */
+			xs->here->xe_local = 1;
+			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+			/* Clear value bytes. */
+			memset(val + OCFS2_XATTR_SIZE(name_len),
+			       0,
+			       OCFS2_XATTR_SIZE(xi->value_len));
+			memcpy(val + OCFS2_XATTR_SIZE(name_len),
+			       xi->value,
+			       xi->value_len);
+			return;
+		}
+		/* Remove the old name+value. */
+		/* Everything stored below it slides up by @size bytes. */
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		xs->here->xe_name_hash = 0;
+		xs->here->xe_name_offset = 0;
+		xs->here->xe_local = 0;
+		xs->here->xe_value_size = 0;
+
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = xs->header->xh_entries;
+		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+			size_t o = le16_to_cpu(last->xe_name_offset);
+			/* Only pairs that sat below the removed one moved. */
+			if (last->xe_value_size && o < offs)
+				last->xe_name_offset = cpu_to_le16(o + size);
+			last += 1;
+		}
+
+		if (!xi->value) {
+			/* Remove the old entry. */
+			last -= 1;
+			memmove(xs->here, xs->here + 1,
+				(void *)last - (void *)xs->here);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xs->header->xh_count, -1);
+		}
+	}
+	if (xi->value) {
+		/* Insert the new name+value. */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len);
+		void *val = xs->base + min_offs - size;
+		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+		/* Clear bytes. */
+		memset(val, 0, size);
+		memcpy(val, xi->name, name_len);
+		memcpy(val + OCFS2_XATTR_SIZE(name_len),
+		       xi->value,
+		       xi->value_len);
+		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+		xs->here->xe_local = 1;
+		ocfs2_xattr_hash_entry(xs->header, xs->here);
+	}
+	return;
+}
+
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Create, replace or remove (xi->value == NULL) one attribute in the
+ * storage area described by @xs.
+ *
+ * Values up to OCFS2_XATTR_INLINE_SIZE bytes are stored next to the name.
+ * For larger values a placeholder value root (def_xv) is stored locally
+ * first, and the real value is then written out of line through
+ * ocfs2_xattr_set_value_outside().
+ *
+ * Returns 0 on success, -ENOSPC when the area cannot hold the new entry,
+ * or a negative error from the truncate/journal helpers.  A failed
+ * truncate or a failed journal step ends the operation: the out-of-line
+ * value is not written afterwards and the error is returned to the caller.
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *last;
+	size_t free, min_offs = xs->end - xs->base;
+	size_t name_len = strlen(xi->name);
+	size_t size_l = 0;
+	handle_t *handle = NULL;
+	int i, ret;
+	struct ocfs2_xattr_info xi_l = {
+		.name_index = xi->name_index,
+		.name = xi->name,
+		.value = xi->value,
+		.value_len = xi->value_len,
+	};
+
+	/* Compute min_offs and last. */
+	last = xs->header->xh_entries;
+
+	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	/* Gap between the end of the entry table and the lowest pair. */
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+
+	if (!xs->not_found) {
+		/* Replacing: the space of the old entry becomes available. */
+		size_t size = 0;
+		if (xs->here->xe_local)
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		free += (size + sizeof(struct ocfs2_xattr_entry));
+	}
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_ROOT_SIZE) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		/* Store a placeholder root locally; real value goes outside. */
+		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+		xi_l.value = (void *)&def_xv;
+		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->value) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	if (!xs->not_found) {
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		void *val = xs->base + offs;
+
+		if (xs->here->xe_local && size == size_l) {
+			/* Old local pair is exactly as big as name + root. */
+			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+							    offs);
+			goto out;
+		} else if (!xs->here->xe_local) {
+			struct ocfs2_xattr_value_root *xv = NULL;
+			xv = (struct ocfs2_xattr_value_root *)(val +
+				OCFS2_XATTR_SIZE(name_len));
+
+			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+				/* Outside -> outside: resize and rewrite. */
+				ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 xi->value_len);
+				if (ret < 0)
+					goto out;
+
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								xv,
+								xi->value,
+								xi->value_len);
+				if (ret < 0)
+					goto out;
+
+				ret = ocfs2_xattr_update_entry(inode,
+							       xi,
+							       xs,
+							       offs);
+				goto out;
+			} else {
+				/*
+				 * Outside -> local or removal: free the
+				 * clusters first and stop if that fails, so
+				 * the entry is not dropped while its clusters
+				 * are still allocated.
+				 */
+				ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 0);
+				if (ret < 0)
+					goto out;
+			}
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+	/*
+	 * Write the real value outside only when the local placeholder was
+	 * stored successfully; otherwise keep and return the earlier error.
+	 */
+	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+	}
+
+out:
+	mlog_exit(ret);
+	return ret;
+
+}
+
+/*
+ * ocfs2_xattr_free_block()
+ *
+ * Hand the suballocator bit recorded in @xb back to the
+ * EXTENT_ALLOC_SYSTEM_INODE of the slot stored in @xb.  The running
+ * transaction @handle is extended with OCFS2_SUBALLOC_FREE before the
+ * bit is released.
+ *
+ * Returns 0 on success, -ENOMEM when the allocator inode cannot be
+ * obtained, or the negative error of the step that failed.
+ */
+static int ocfs2_xattr_free_block(handle_t *handle,
+				  struct ocfs2_super *osb,
+				  struct ocfs2_xattr_block *xb)
+{
+	u16 bit = le16_to_cpu(xb->xb_suballoc_bit);
+	u64 group_blkno = ocfs2_which_suballoc_group(le64_to_cpu(xb->xb_blkno),
+						     bit);
+	struct buffer_head *alloc_bh = NULL;
+	struct inode *alloc_inode;
+	int status;
+
+	alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!alloc_inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		return status;
+	}
+
+	mutex_lock(&alloc_inode->i_mutex);
+
+	status = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto drop_mutex;
+	}
+
+	/* Only release the bit once the transaction could be extended. */
+	status = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+	if (status >= 0)
+		status = ocfs2_free_suballoc_bits(handle, alloc_inode,
+						  alloc_bh, bit,
+						  group_blkno, 1);
+	if (status < 0)
+		mlog_errno(status);
+
+	ocfs2_inode_unlock(alloc_inode, 1);
+	if (alloc_bh)
+		brelse(alloc_bh);
+drop_mutex:
+	mutex_unlock(&alloc_inode->i_mutex);
+	iput(alloc_inode);
+	return status;
+}
+
+/*
+ * Truncate to zero length every xattr value that @header marks as being
+ * stored outside of the entry area (xe_local == 0).
+ *
+ * @inode:  inode owning the attributes
+ * @bh:     buffer holding @header, passed to ocfs2_xattr_value_truncate()
+ * @header: xattr header whose xh_count entries are walked
+ *
+ * Returns 0 on success, or the first negative error reported by
+ * ocfs2_xattr_value_truncate(); the walk stops at the failing entry so
+ * the error is not overwritten by a later iteration.
+ */
+static int ocfs2_remove_value_outside(struct inode *inode,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *header)
+{
+	int ret = 0, i;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		/* Examine entry i; do not keep re-testing the first entry. */
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+		struct ocfs2_xattr_value_root *xv;
+		void *val;
+
+		if (entry->xe_local)
+			continue;
+
+		/* The value root follows the padded name of the entry. */
+		val = (void *)header + le16_to_cpu(entry->xe_name_offset);
+		xv = (struct ocfs2_xattr_value_root *)
+			(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+
+		ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+		if (ret < 0)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * Release the outside-stored values of the attributes kept in the inode
+ * body.  The in-inode xattr header is located OCFS2_MAX_XATTR_INLINE_SIZE
+ * bytes before the end of the inode block held in @di_bh.
+ */
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+				    struct buffer_head *di_bh)
+{
+	void *block_end = (void *)di_bh->b_data + inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_header *xh;
+
+	xh = (struct ocfs2_xattr_header *)
+		(block_end - OCFS2_MAX_XATTR_INLINE_SIZE);
+
+	return ocfs2_remove_value_outside(inode, di_bh, xh);
+}
+
+/*
+ * Release the outside-stored values of the attributes kept in the
+ * external xattr block held in @blk_bh.
+ */
+static int ocfs2_xattr_block_remove(struct inode *inode,
+				    struct buffer_head *blk_bh)
+{
+	struct ocfs2_xattr_block *xblk =
+		(struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	return ocfs2_remove_value_outside(inode, blk_bh,
+					  &xblk->xb_attrs.xb_header);
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ *
+ * Values stored outside the inode body and outside the xattr block are
+ * truncated first (failures there are only logged).  Then, in one
+ * transaction, the external xattr block is given back to its allocator,
+ * i_xattr_loc is cleared and the OCFS2_INLINE_XATTR_FL and
+ * OCFS2_HAS_XATTR_FL bits are removed from i_dyn_features.
+ *
+ * Returns 0 when the inode has no xattrs or on success, otherwise a
+ * negative error code.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+	struct ocfs2_xattr_block *xb;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_XATTR_FL))
+		return 0;
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+	if (di->i_xattr_loc) {
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(di->i_xattr_loc),
+				       &blk_bh, OCFS2_BH_CACHED, inode);
+		if (ret)
+			return ret;
+		ret = ocfs2_xattr_block_remove(inode, blk_bh);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (di->i_xattr_loc) {
+		xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+		/*
+		 * Only forget the block once it really went back to the
+		 * allocator; otherwise keep the pointer so the block is
+		 * not orphaned.
+		 */
+		ret = ocfs2_xattr_free_block(handle, osb, xb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		di->i_xattr_loc = cpu_to_le64(0);
+	}
+
+	/*
+	 * Clear just the two xattr bits.  This needs the bitwise
+	 * complement: a logical '!' of a non-zero flag is 0 and would
+	 * wipe every other dynamic feature of the inode as well.
+	 */
+	di->i_dyn_features = cpu_to_le16(le16_to_cpu(di->i_dyn_features) &
+					 ~(OCFS2_INLINE_XATTR_FL |
+					   OCFS2_HAS_XATTR_FL));
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	if (blk_bh)
+		brelse(blk_bh);
+	return ret;
+}
+
+/*
+ * OR the bits in @flag into the i_dyn_features field of the dinode held
+ * in @di_bh.  The update runs in a transaction of its own, started with
+ * OCFS2_INODE_UPDATE_CREDITS.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_xattr_update_flag(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   int flag)
+{
+	struct ocfs2_dinode *dinode = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *trans;
+	int status;
+
+	trans = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				  OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	status = ocfs2_journal_access(trans, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status) {
+		mlog_errno(status);
+		goto commit;
+	}
+
+	dinode->i_dyn_features =
+		cpu_to_le16(le16_to_cpu(dinode->i_dyn_features) | flag);
+
+	status = ocfs2_journal_dirty(trans, di_bh);
+	if (status < 0)
+		mlog_errno(status);
+commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), trans);
+	return status;
+}
+
+/*
+ * Point the search state @xs at the in-inode xattr area and, when the
+ * inode carries OCFS2_INLINE_XATTR_FL, look the attribute up there.
+ *
+ * Nothing is done for filesystems whose block size equals
+ * OCFS2_MIN_BLOCKSIZE.  xs->not_found receives the result of the lookup
+ * (0 or -ENODATA); any other lookup error is returned to the caller.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int status;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return 0;
+
+	/* The xattr area is the tail of the inode block. */
+	xs->xattr_bh = xs->inode_bh;
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - OCFS2_MAX_XATTR_INLINE_SIZE);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL))
+		return 0;
+
+	/* Find the named attribute. */
+	status = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (status && status != -ENODATA)
+		return status;
+
+	xs->not_found = status;
+	return 0;
+}
+
+/*
+ * Apply @xi to the in-inode xattr area described by @xs.  On success
+ * the inode gets OCFS2_INLINE_XATTR_FL and OCFS2_HAS_XATTR_FL if the
+ * inline flag was not set yet.
+ *
+ * Returns -ENOSPC for filesystems whose block size equals
+ * OCFS2_MIN_BLOCKSIZE, otherwise the result of the set or of the flag
+ * update.
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int status;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return -ENOSPC;
+
+	status = ocfs2_xattr_set_entry(inode, xi, xs);
+	if (status)
+		return status;
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		return 0;
+
+	return ocfs2_xattr_update_flag(inode, xs->inode_bh,
+				       (OCFS2_INLINE_XATTR_FL |
+				       OCFS2_HAS_XATTR_FL));
+}
+
+/*
+ * Read the inode's external xattr block (if di->i_xattr_loc is set),
+ * point the search state @xs at it and look the attribute up there.
+ *
+ * On success (return 0) xs->xattr_bh keeps a reference to the block
+ * buffer, which the caller has to release, and xs->not_found holds the
+ * lookup result (0 or -ENODATA).  Without an xattr block @xs is left
+ * untouched and 0 is returned.
+ *
+ * On a lookup error other than -ENODATA the buffer is released here and
+ * @xs is reset, so that the caller's own cleanup does not release the
+ * same buffer a second time through xs->xattr_bh.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	int ret;
+
+	if (!di->i_xattr_loc)
+		return 0;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		return ret;
+
+	xs->xattr_bh = blk_bh;
+	xs->header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
+			xb_attrs.xb_header;
+	xs->base = (void *)xs->header;
+	xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret && ret != -ENODATA) {
+		/* Do not leave pointers into a buffer we no longer hold. */
+		xs->xattr_bh = NULL;
+		xs->header = NULL;
+		xs->base = NULL;
+		xs->end = NULL;
+		xs->here = NULL;
+		brelse(blk_bh);
+		return ret;
+	}
+
+	xs->not_found = ret;
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Apply @xi to the inode's external xattr block.
+ *
+ * When the search state carries no block buffer yet
+ * (xs->xattr_bh == NULL), one metadata block is claimed, initialised as
+ * an empty xattr block, recorded in di->i_xattr_loc and installed in
+ * @xs; the reference stored in xs->xattr_bh is left for the caller to
+ * release.  After a successful ocfs2_xattr_set_entry() the inode gets
+ * OCFS2_HAS_XATTR_FL if that bit was not set yet.
+ *
+ * Returns 0 on success or a negative error code, including a failure
+ * to record OCFS2_HAS_XATTR_FL.
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_block *xblk = NULL;
+	int credits;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	int ret;
+
+	if (!xs->xattr_bh) {
+		ret = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
+		if (ret < 0)
+			goto out;
+
+		credits = ocfs2_calc_extend_credits(osb->sb, el, 1);
+		handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+					   &suballoc_bit_start, &num_got,
+					   &first_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		new_bh = sb_getblk(inode->i_sb, first_blkno);
+		if (!new_bh) {
+			/* No buffer for the new block: nothing to set up. */
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			/* Not handed to @xs yet, so drop it here. */
+			brelse(new_bh);
+			goto out_commit;
+		}
+
+		/* set ocfs2_xattr_block */
+		xs->xattr_bh = new_bh;
+		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+		memset(xblk, 0, inode->i_sb->s_blocksize);
+		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE_1);
+		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+		xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+		xs->header = &xblk->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret < 0)
+			goto out_commit;
+		di->i_xattr_loc = cpu_to_le64(first_blkno);
+		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+out_commit:
+		ocfs2_commit_trans(osb, handle);
+out:
+		if (meta_ac)
+			ocfs2_free_alloc_context(meta_ac);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = ocfs2_xattr_set_entry(inode, xi, xs);
+	if (!ret && !(le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_XATTR_FL))
+		/* Report a failed flag update, as ocfs2_xattr_ibody_set does. */
+		ret = ocfs2_xattr_update_flag(inode,
+					      xs->inode_bh,
+					      OCFS2_HAS_XATTR_FL);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Create, replace or remove an extended attribute of @inode.  The whole
+ * operation runs with the inode lock taken through
+ * ocfs2_inode_lock(inode, &di_bh, 1).
+ *
+ *  - The attribute is searched in the inode body first and, if it is
+ *    not there, in the external xattr block.
+ *  - XATTR_REPLACE on a missing attribute fails with -ENODATA,
+ *    XATTR_CREATE on an existing one with -EEXIST.
+ *  - A NULL @value removes the attribute from wherever it was found;
+ *    removing a missing attribute returns 0.
+ *  - A new value is tried in the inode body first.  If that succeeds, a
+ *    copy found in the xattr block is removed.  If it fails with
+ *    -ENOSPC the value goes to the xattr block instead and a copy
+ *    found in the inode body is removed.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    const void *value,
+		    size_t value_len,
+		    int flags)
+{
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	xis.inode_bh = di_bh;
+	xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (xis.not_found && xbs.not_found) {
+		/* The attribute does not exist anywhere. */
+		if (flags & XATTR_REPLACE) {
+			ret = -ENODATA;
+			goto cleanup;
+		}
+		if (!value) {
+			ret = 0;
+			goto cleanup;
+		}
+	} else if (flags & XATTR_CREATE) {
+		ret = -EEXIST;
+		goto cleanup;
+	}
+
+	if (!value) {
+		/* Removal: act on the location that holds the attribute. */
+		if (!xis.not_found)
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		else if (!xbs.not_found)
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		goto cleanup;
+	}
+
+	ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+	if (!ret) {
+		if (!xbs.not_found) {
+			/* Stored in the inode; drop the block copy. */
+			xi.value = NULL;
+			xi.value_len = 0;
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		}
+	} else if (ret == -ENOSPC) {
+		if (di->i_xattr_loc && !xbs.xattr_bh) {
+			ret = ocfs2_xattr_block_find(inode, name_index,
+						     name, &xbs);
+			if (ret)
+				goto cleanup;
+		}
+		ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		if (ret)
+			goto cleanup;
+		if (!xis.not_found) {
+			/* Stored in the block; drop the inode copy. */
+			xi.value = NULL;
+			xi.value_len = 0;
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		}
+	}
+cleanup:
+	ocfs2_inode_unlock(inode, 1);
+	if (di_bh)
+		brelse(di_bh);
+	if (xbs.xattr_bh)
+		brelse(xbs.xattr_bh);
+	return ret;
+}
+
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index debde62..0732a00 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -77,6 +77,11 @@ struct ocfs2_xattr_tree_root {
 /*10*/	struct ocfs2_extent_list	xt_list;
 };
 
+struct ocfs2_xattr_def_value_root {	/* a value root followed by one extent record */
+	struct ocfs2_xattr_value_root	xv;	/* the value root itself */
+	struct ocfs2_extent_rec		er;	/* laid out right after xv; presumably backs the first record of xv's extent list - TODO confirm */
+};
+
 #define OCFS2_XATTR_INDEXED 0x1
 
 struct ocfs2_xattr_block {
@@ -95,4 +100,26 @@ struct ocfs2_xattr_block {
 	} xb_attrs;
 };
 
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
+#ifdef CONFIG_OCFS2_FS_LUSTRE
+extern struct xattr_handler ocfs2_xattr_lustre_handler;
+#endif
+#ifdef CONFIG_OCFS2_FS_SECURITY
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#endif
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+			   size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/ocfs2/xattr_trusted.c b/fs/ocfs2/xattr_trusted.c
new file mode 100644
index 0000000..79f70d3
--- /dev/null
+++ b/fs/ocfs2/xattr_trusted.c
@@ -0,0 +1,79 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr_trusted.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+/*
+ * Emit "trusted.<name>" plus a terminating NUL into @list.
+ *
+ * The bytes are only written when @list is non-NULL and @list_size can
+ * hold them; the number of bytes required is returned either way.
+ */
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
+{
+	const size_t plen = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t needed = plen + name_len + 1;
+	char *out = list;
+
+	if (!out || needed > list_size)
+		return needed;
+
+	memcpy(out, XATTR_TRUSTED_PREFIX, plen);
+	out += plen;
+	memcpy(out, name, name_len);
+	out[name_len] = '\0';
+
+	return needed;
+}
+
+/*
+ * Read attribute @name from the OCFS2_XATTR_INDEX_TRUSTED name index.
+ * An empty @name is rejected with -EINVAL.
+ */
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+			       buffer, size);
+}
+
+/*
+ * Write attribute @name in the OCFS2_XATTR_INDEX_TRUSTED name index.
+ * An empty @name is rejected with -EINVAL.
+ */
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,		/* "trusted." */
+	.list	= ocfs2_xattr_trusted_list,	/* writes "trusted.<name>" into a listing buffer */
+	.get	= ocfs2_xattr_trusted_get,	/* ocfs2_xattr_get() with OCFS2_XATTR_INDEX_TRUSTED */
+	.set	= ocfs2_xattr_trusted_set,	/* ocfs2_xattr_set() with OCFS2_XATTR_INDEX_TRUSTED */
+};
diff --git a/fs/ocfs2/xattr_user.c b/fs/ocfs2/xattr_user.c
new file mode 100644
index 0000000..ef37003
--- /dev/null
+++ b/fs/ocfs2/xattr_user.c
@@ -0,0 +1,91 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr_user.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+#include "xattr.h"
+
+#define XATTR_USER_PREFIX "user."
+
+/*
+ * Emit "user.<name>" plus a terminating NUL into @list.
+ *
+ * Returns 0 without writing anything when the filesystem is mounted
+ * with OCFS2_MOUNT_NOUSERXATTR.  Otherwise the bytes are written only
+ * when @list is non-NULL and @list_size can hold them, and the number
+ * of bytes required is returned either way.
+ */
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
+{
+	const size_t plen = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t needed = plen + name_len + 1;
+	char *out = list;
+
+	if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (!out || needed > list_size)
+		return needed;
+
+	memcpy(out, XATTR_USER_PREFIX, plen);
+	out += plen;
+	memcpy(out, name, name_len);
+	out[name_len] = '\0';
+
+	return needed;
+}
+
+/*
+ * Read attribute @name from the OCFS2_XATTR_INDEX_USER name index.
+ *
+ * An empty @name is rejected with -EINVAL; a mount with
+ * OCFS2_MOUNT_NOUSERXATTR yields -EOPNOTSUPP.
+ */
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct ocfs2_super *sbi = OCFS2_SB(inode->i_sb);
+
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (sbi->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+/*
+ * Write attribute @name in the OCFS2_XATTR_INDEX_USER name index.
+ *
+ * An empty @name is rejected with -EINVAL; a mount with
+ * OCFS2_MOUNT_NOUSERXATTR yields -EOPNOTSUPP.
+ */
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	struct ocfs2_super *sbi = OCFS2_SB(inode->i_sb);
+
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (sbi->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,		/* "user." */
+	.list	= ocfs2_xattr_user_list,	/* writes "user.<name>" unless OCFS2_MOUNT_NOUSERXATTR is set */
+	.get	= ocfs2_xattr_user_get,		/* ocfs2_xattr_get() with OCFS2_XATTR_INDEX_USER */
+	.set	= ocfs2_xattr_user_set,		/* ocfs2_xattr_set() with OCFS2_XATTR_INDEX_USER */
+};
-- 
1.5.4.4

Tao Ma

2008-Jun-05 07:31 UTC

head link

[Ocfs2-devel] [PATCH 1/8] Modify ocfs2_num_free_extents for future xattr usage.v1

ocfs2_num_free_extents is used to find the number of free extent
records. Its old parameter was "ocfs2_dinode *", which isn't suitable
for xattr values, so change it to "buffer_head *".
ocfs2_lock_allocators is also modified in this patch because it
needs a "buffer_head *" to call ocfs2_num_free_extents.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |    3 ++-
 fs/ocfs2/alloc.h |    2 +-
 fs/ocfs2/aops.c  |    5 +++--
 fs/ocfs2/dir.c   |    3 ++-
 fs/ocfs2/file.c  |   11 ++++++-----
 fs/ocfs2/file.h  |    2 +-
 6 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb46..c74711f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -368,12 +368,13 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe)
+			   struct buffer_head *bh)
 {
 	int retval;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
 
 	mlog_entry_void();
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94b..758dbda 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -47,7 +47,7 @@ int ocfs2_remove_extent(struct inode *inode, struct
buffer_head *di_bh,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe);
+			   struct buffer_head *bh);
 /* how many new metadata chunks would an allocation need at maximum? */
 static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0..6d933df 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1702,8 +1702,9 @@ int ocfs2_write_begin_nolock(struct address_space
*mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    extents_to_split, &data_ac, &meta_ac);
+		ret = ocfs2_lock_allocators(inode, wc->w_di_bh,
+					    clusters_to_alloc, extents_to_split,
+					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a18758..8a14fff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1474,7 +1474,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		num_free_extents = ocfs2_num_free_extents(osb, dir,
+							  parent_fe_bh);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30..3993312 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -521,7 +521,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, fe);
+	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -609,7 +609,7 @@ leave:
  * File systems which don't support holes call this from
  * ocfs2_extend_allocation().
  */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *di_bh,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
@@ -617,6 +617,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct
ocfs2_dinode *di,
 	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	*meta_ac = NULL;
 	if (data_ac)
@@ -629,7 +630,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct
ocfs2_dinode *di,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long
long)i_size_read(inode),
 	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+	num_free_extents = ocfs2_num_free_extents(osb, inode, di_bh);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
@@ -724,7 +725,7 @@ static int __ocfs2_extend_allocation(struct inode *inode,
u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
+	status = ocfs2_lock_allocators(inode, bh, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -1395,7 +1396,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, di_bh, 0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 048ddca..e38ecb2 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -55,7 +55,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *fe,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:32 UTC

head link

[Ocfs2-devel] [PATCH 2/8] Use ocfs2_extent_list instead of ocfs2_dinode.v1

ocfs2_extend_meta_needed, ocfs2_calc_extend_credits and
ocfs2_reserve_new_metadata are all useful for extent tree operations.
But they are all limited by using ocfs2_dinode as the parameter.
Change their parameter to ocfs2_extent_list so that xattr extent
list can use them.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c    |    3 ++-
 fs/ocfs2/alloc.h    |   12 +++++++++---
 fs/ocfs2/aops.c     |    3 ++-
 fs/ocfs2/dir.c      |    5 +++--
 fs/ocfs2/file.c     |    9 +++++----
 fs/ocfs2/journal.h  |   17 +++++++++++------
 fs/ocfs2/suballoc.c |    4 ++--
 fs/ocfs2/suballoc.h |    7 ++++++-
 8 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c74711f..dc844df 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4536,7 +4536,8 @@ static int ocfs2_split_tree(struct inode *inode, struct
buffer_head *di_bh,
 	} else
 		rightmost_el = path_leaf_el(path);
 
-	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+	credits += path->p_tree_depth +
+		   ocfs2_extend_meta_needed(&di->id2.i_list);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 758dbda..249e79e 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -48,8 +48,14 @@ int ocfs2_remove_extent(struct inode *inode, struct
buffer_head *di_bh,
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *bh);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
 {
 	/*
 	 * Rather than do all the work of determining how much we need
@@ -59,7 +65,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode
*fe)
 	 * new tree_depth==0 extent_block, and one block at the new
 	 * top-of-the tree.
 	 */
-	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+	return le16_to_cpu(root_el->l_tree_depth) + 2;
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode
*di);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6d933df..f83a2a4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1710,7 +1710,8 @@ int ocfs2_write_begin_nolock(struct address_space
*mapping,
 			goto out;
 		}
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+		credits = ocfs2_calc_extend_credits(inode->i_sb,
+						    &di->id2.i_list,
 						    clusters_to_alloc);
 
 	}
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a14fff..5e8cd6d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1425,6 +1425,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
@@ -1483,7 +1484,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		}
 
 		if (!num_free_extents) {
-			status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
 			if (status < 0) {
 				if (status != -ENOSPC)
 					mlog_errno(status);
@@ -1498,7 +1499,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3993312..79d7da9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -540,7 +540,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(fe))) {
+		       < ocfs2_extend_meta_needed(&fe->id2.i_list))) {
 		mlog(0, "filesystem is really fragmented...\n");
 		status = -EAGAIN;
 		reason = RESTART_META;
@@ -652,7 +652,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct
buffer_head *di_bh,
 	 */
 	if (!num_free_extents ||
 	    (ocfs2_sparse_alloc(osb) && num_free_extents <
max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+		ret = ocfs2_reserve_new_metadata(osb, &di->id2.i_list, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -732,7 +732,8 @@ restart_all:
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -790,7 +791,7 @@ restarted_transaction:
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
-							    fe,
+							    &fe->id2.i_list,
 							    clusters_to_add);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2..f1479ab 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -339,11 +339,16 @@ int                  ocfs2_journal_dirty_data(handle_t
*handle,
 #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
 			     + OCFS2_UNLINK_CREDITS)
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_dinode *fe,
+					    struct ocfs2_extent_list *root_el,
 					    u32 bits_wanted)
 {
-	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
 	/* bitmap dinode, group desc. + relinked group. */
 	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -354,16 +359,16 @@ static inline int ocfs2_calc_extend_credits(struct
super_block *sb,
 	 * however many metadata chunks needed * a remaining suballoc
 	 * alloc. */
 	sysfile_bitmap_blocks = 1 +
-		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
 
 	/* this does not include *new* metadata blocks, which are
-	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * accounted for in sysfile_bitmap_blocks. root_el +
 	 * prev. last_eb_blk + blocks along edge of tree.
 	 * calc_symlink_credits passes because we just need 1
 	 * credit for the dinode there. */
-	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278f..af769a5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -494,7 +494,7 @@ bail:
 }
 
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac)
 {
 	int status;
@@ -507,7 +507,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(root_el);
 	(*ac)->ac_which = OCFS2_AC_USE_META;
 	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600..d024c69 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -59,8 +59,13 @@ static inline int ocfs2_alloc_context_bits_left(struct
ocfs2_alloc_context *ac)
 	return ac->ac_bits_wanted - ac->ac_bits_given;
 }
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac);
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:32 UTC

head link

[Ocfs2-devel] [PATCH 3/8] Make ocfs2_lock_allocators generic for extent allocation.v1

Currently ocfs2_lock_allocators is local to file.c and is only used
when extending a file's allocation. But the whole function is also
useful when we want to store large EAs, so make it generic.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c    |  460 ++++++++++++++++++++++++++++++++-------------------
 fs/ocfs2/alloc.h    |   23 ++-
 fs/ocfs2/aops.c     |   11 +-
 fs/ocfs2/dir.c      |    7 +-
 fs/ocfs2/file.c     |  104 ++----------
 fs/ocfs2/file.h     |    4 -
 fs/ocfs2/suballoc.c |   82 +++++++++
 fs/ocfs2/suballoc.h |    5 +
 8 files changed, 418 insertions(+), 278 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc844df..71a89b2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,94 @@
 
 #include "buffer_head_io.h"
 
+struct ocfs2_extent_tree_operations {
+	void (*set_last_eb_blk) (void *et, u64 blkno);
+	u64 (*get_last_eb_blk) (void *et);
+	int (*sanity_check) (struct inode *inode, void *et);
+};
+
+struct ocfs2_extent_tree {
+	enum ocfs2_extent_tree_type type;
+	struct ocfs2_extent_tree_operations *eops;
+	struct buffer_head *root_bh;
+	struct ocfs2_extent_list *root_el;
+};
+
+static void ocfs2_dinode_set_last_eb_blk(void *p, u64 blkno)
+{
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(void *p)
+{
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode, void *p)
+{
+	int ret = 0;
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_dinode *di;
+
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+
+	di = (struct ocfs2_dinode *)et->root_bh->b_data;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ret = -EIO;
+		ocfs2_error(inode->i_sb,
+			"Inode %llu has invalid path root",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+
+	return ret;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.sanity_check		= ocfs2_dinode_sanity_check,
+};
+
+static struct ocfs2_extent_tree*
+	 ocfs2_new_extent_tree(struct buffer_head *bh,
+			       enum ocfs2_extent_tree_type et_type)
+{
+	struct ocfs2_extent_tree *et;
+
+	et = kzalloc(sizeof(*et), GFP_NOFS);
+	if (!et)
+		return NULL;
+
+	et->type = et_type;
+	get_bh(bh);
+	et->root_bh = bh;
+
+	/* currently we only support dinode extent. */
+	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
+	if (et_type == OCFS2_DINODE_EXTENT) {
+		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
+		et->eops = &ocfs2_dinode_et_ops;
+	}
+
+	return et;
+}
+
+static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
+{
+	if (et) {
+		brelse(et->root_bh);
+		kfree(et);
+	}
+}
+
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 					 struct ocfs2_extent_block *eb);
@@ -205,17 +293,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head
*root_bh,
 }
 
 /*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
-
-	return ocfs2_new_path(di_bh, el);
-}
-
-/*
  * Convenience function to journal all components in a path.
  */
 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,24 +445,33 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *bh)
+			   struct buffer_head *root_bh,
+			   enum ocfs2_extent_tree_type type)
 {
 	int retval;
-	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_list *el = NULL;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
+	u64 last_eb_blk = 0;
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		retval = -EIO;
-		goto bail;
+	if (type == OCFS2_DINODE_EXTENT) {
+		struct ocfs2_dinode *fe =
+				(struct ocfs2_dinode *)root_bh->b_data;
+		if (!OCFS2_IS_VALID_DINODE(fe)) {
+			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+			retval = -EIO;
+			goto bail;
+		}
+
+		if (fe->i_last_eb_blk)
+			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
+		el = &fe->id2.i_list;
 	}
 
-	if (fe->i_last_eb_blk) {
-		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+	if (last_eb_blk) {
+		retval = ocfs2_read_block(osb, last_eb_blk,
 					  &eb_bh, OCFS2_BH_CACHED, inode);
 		if (retval < 0) {
 			mlog_errno(retval);
@@ -393,8 +479,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 		}
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
-	} else
-		el = &fe->id2.i_list;
+	}
 
 	BUG_ON(el->l_tree_depth != 0);
 
@@ -532,7 +617,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct
ocfs2_extent_list  *el)
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct buffer_head *fe_bh,
+			    struct ocfs2_extent_tree *et,
 			    struct buffer_head *eb_bh,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
@@ -541,7 +626,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
@@ -551,13 +635,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
 	if (eb_bh) {
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
 	} else
-		el = &fe->id2.i_list;
+		el = et->root_el;
 
 	/* we never add a branch to a leaf. */
 	BUG_ON(!el->l_tree_depth);
@@ -647,7 +729,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -663,7 +745,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	}
 
 	/* Link the new branch into the rest of the tree (el will
-	 * either be on the fe, or the extent block passed in. */
+	 * either be on the root_bh, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -672,7 +754,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	/* fe needs a new last extent block pointer, as does the
 	 * next_leaf on the previously last-extent-block. */
-	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+	et->eops->set_last_eb_blk(et, new_last_eb_blk);
 
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -680,7 +762,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->root_bh);
 	if (status < 0)
 		mlog_errno(status);
 	if (eb_bh) {
@@ -718,16 +800,15 @@ bail:
 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  handle_t *handle,
 				  struct inode *inode,
-				  struct buffer_head *fe_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_alloc_context *meta_ac,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
 	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
 	mlog_entry_void();
@@ -747,8 +828,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb_el = &eb->h_list;
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	fe_el = &fe->id2.i_list;
+	root_el = et->root_el;
 
 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -757,11 +837,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	/* copy the fe data into the new extent block */
-	eb_el->l_tree_depth = fe_el->l_tree_depth;
-	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		eb_el->l_recs[i] = fe_el->l_recs[i];
+	/* copy the root extent list data into the new extent block */
+	eb_el->l_tree_depth = root_el->l_tree_depth;
+	eb_el->l_next_free_rec = root_el->l_next_free_rec;
+	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = root_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -769,7 +849,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -778,21 +858,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
 
-	/* update fe now */
-	le16_add_cpu(&fe_el->l_tree_depth, 1);
-	fe_el->l_recs[0].e_cpos = 0;
-	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
-	fe_el->l_next_free_rec = cpu_to_le16(1);
+	/* update root_bh now */
+	le16_add_cpu(&root_el->l_tree_depth, 1);
+	root_el->l_recs[0].e_cpos = 0;
+	root_el->l_recs[0].e_blkno = eb->h_blkno;
+	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	root_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
 	 * becomes the allocated extent block */
-	if (fe_el->l_tree_depth == cpu_to_le16(1))
-		fe->i_last_eb_blk = eb->h_blkno;
+	if (root_el->l_tree_depth == cpu_to_le16(1))
+		et->eops->set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -818,22 +898,21 @@ bail:
  * 1) a lowest extent block is found, then we pass it back in
  *    *lowest_eb_bh and return '0'
  *
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
  *    pass NULL back in *lowest_eb_bh, but still return '0'
  *
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
  *    which case we return > 0
  *
  * return status < 0 indicates an error.
  */
 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 				    struct inode *inode,
-				    struct buffer_head *fe_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **target_bh)
 {
 	int status = 0, i;
 	u64 blkno;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *el;
 	struct buffer_head *bh = NULL;
@@ -843,8 +922,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = NULL;
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
+	el = et->root_el;
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -896,8 +974,8 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	/* If we didn't find one and the fe doesn't have any room,
 	 * then return '1' */
-	if (!lowest_bh
-	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+	el = et->root_el;
+	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
 		status = 1;
 
 	*target_bh = lowest_bh;
@@ -920,19 +998,19 @@ bail:
  * *last_eb_bh will be updated by ocfs2_add_branch().
  */
 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
-			   struct buffer_head *di_bh, int *final_depth,
+			   struct ocfs2_extent_tree *et, int *final_depth,
 			   struct buffer_head **last_eb_bh,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+	struct ocfs2_extent_list *el = et->root_el;
+	int depth = le16_to_cpu(el->l_tree_depth);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *bh = NULL;
 
 	BUG_ON(meta_ac == NULL);
 
-	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
 	if (shift < 0) {
 		ret = shift;
 		mlog_errno(ret);
@@ -949,7 +1027,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t
*handle,
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
 		 * ocfs2_add_branch). */
-		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
 					     meta_ac, &bh);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -976,7 +1054,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t
*handle,
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
 	mlog(0, "add branch. bh = %p\n", bh);
-	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2068,11 +2146,11 @@ static int ocfs2_rotate_subtree_left(struct inode
*inode, handle_t *handle,
 				     struct ocfs2_path *right_path,
 				     int subtree_index,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     int *deleted)
+				     int *deleted,
+				     struct ocfs2_extent_tree *et)
 {
 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
-	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
 	struct ocfs2_extent_block *eb;
 
@@ -2124,7 +2202,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode,
handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -2199,7 +2277,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode,
handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		et->eops->set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 		/*
 		 * Removal of the extent in the left leaf was skipped
@@ -2209,7 +2287,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode,
handle_t *handle,
 		if (right_has_empty)
 			ocfs2_remove_empty_extent(left_leaf_el);
 
-		ret = ocfs2_journal_dirty(handle, di_bh);
+		ret = ocfs2_journal_dirty(handle, et_root_bh);
 		if (ret)
 			mlog_errno(ret);
 
@@ -2332,7 +2410,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 				    handle_t *handle, int orig_credits,
 				    struct ocfs2_path *path,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    struct ocfs2_path **empty_extent_path)
+				    struct ocfs2_path **empty_extent_path,
+				    struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_root, deleted;
 	u32 right_cpos;
@@ -2405,7 +2484,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
 						right_path, subtree_root,
-						dealloc, &deleted);
+						dealloc, &deleted, et);
 		if (ret == -EAGAIN) {
 			/*
 			 * The rotation has to temporarily stop due to
@@ -2448,29 +2527,20 @@ out:
 }
 
 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
-				       struct ocfs2_path *path,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+				struct ocfs2_path *path,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_index;
 	u32 cpos;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	/*
-	 * XXX: This code assumes that the root is an inode, which is
-	 * true for now but may change as tree code gets generic.
-	 */
-	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto out;
-	}
 
+	ret = et->eops->sanity_check(inode, et);
+	if (ret)
+		goto out;
 	/*
 	 * There's two ways we handle this depending on
 	 * whether path is the only existing one.
@@ -2527,7 +2597,7 @@ static int ocfs2_remove_rightmost_path(struct inode
*inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		et->eops->set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 	} else {
 		/*
 		 * 'path' is also the leftmost path which
@@ -2538,12 +2608,12 @@ static int ocfs2_remove_rightmost_path(struct inode
*inode, handle_t *handle,
 		 */
 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-		el = &di->id2.i_list;
+		el = et->root_el;
 		el->l_tree_depth = 0;
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-		di->i_last_eb_blk = 0;
+		et->eops->set_last_eb_blk(et, 0);
 	}
 
 	ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2571,7 +2641,8 @@ out:
  */
 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+				  struct ocfs2_cached_dealloc_ctxt *dealloc,
+				  struct ocfs2_extent_tree *et)
 {
 	int ret, orig_credits = handle->h_buffer_credits;
 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2585,7 +2656,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode,
handle_t *handle,
 	if (path->p_tree_depth == 0) {
 rightmost_no_delete:
 		/*
-		 * In-inode extents. This is trivially handled, so do
+		 * Inline extents. This is trivially handled, so do
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2639,7 +2710,7 @@ rightmost_no_delete:
 		 */
 
 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
-						  dealloc);
+						  dealloc, et);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -2651,7 +2722,7 @@ rightmost_no_delete:
 	 */
 try_rotate:
 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
-				       dealloc, &restart_path);
+				       dealloc, &restart_path, et);
 	if (ret && ret != -EAGAIN) {
 		mlog_errno(ret);
 		goto out;
@@ -2663,7 +2734,7 @@ try_rotate:
 
 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
 					       tmp_path, dealloc,
-					       &restart_path);
+					       &restart_path, et);
 		if (ret && ret != -EAGAIN) {
 			mlog_errno(ret);
 			goto out;
@@ -2949,6 +3020,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 				handle_t *handle,
 				struct ocfs2_extent_rec *split_rec,
 				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et,
 				int index)
 {
 	int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3069,7 +3141,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		    le16_to_cpu(el->l_next_free_rec) == 1) {
 
 			ret = ocfs2_remove_rightmost_path(inode, handle,
-							  right_path, dealloc);
+							  right_path,
+							  dealloc, et);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3096,7 +3169,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     struct ocfs2_merge_ctxt *ctxt)
+				     struct ocfs2_merge_ctxt *ctxt,
+				     struct ocfs2_extent_tree *et)
 
 {
 	int ret = 0;
@@ -3114,7 +3188,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * illegal.
 		 */
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3157,7 +3231,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
 		/* The merge left us with an empty extent, remove it. */
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3171,7 +3246,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		ret = ocfs2_merge_rec_left(inode, path,
 					   handle, rec,
-					   dealloc,
+					   dealloc, et,
 					   split_index);
 
 		if (ret) {
@@ -3180,7 +3255,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		}
 
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		/*
 		 * Error from this last rotate is not critical, so
 		 * print but don't bubble it up.
@@ -3200,7 +3275,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			ret = ocfs2_merge_rec_left(inode,
 						   path,
 						   handle, split_rec,
-						   dealloc,
+						   dealloc, et,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -3223,7 +3298,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			 * our leaf. Try to rotate it away.
 			 */
 			ret = ocfs2_rotate_tree_left(inode, handle, path,
-						     dealloc);
+						     dealloc, et);
 			if (ret)
 				mlog_errno(ret);
 			ret = 0;
@@ -3367,6 +3442,17 @@ static inline void ocfs2_update_dinode_clusters(struct
inode *inode,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
 
+static void ocfs2_update_clusters(struct inode *inode,
+				  struct ocfs2_extent_tree *et,
+				  u32 clusters)
+{
+	if (et->type == OCFS2_DINODE_EXTENT) {
+		struct ocfs2_dinode *di =
+			(struct ocfs2_dinode *)et->root_bh->b_data;
+		ocfs2_update_dinode_clusters(inode, di, clusters);
+	}
+}
+
 static void ocfs2_adjust_rightmost_records(struct inode *inode,
 					   handle_t *handle,
 					   struct ocfs2_path *path,
@@ -3568,8 +3654,8 @@ static void ocfs2_split_record(struct inode *inode,
 }
 
 /*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
  *
  * right_path is the path we want to do the actual insert
  * in. left_path should only be passed in if we need to update that
@@ -3666,7 +3752,7 @@ out:
 
 static int ocfs2_do_insert_extent(struct inode *inode,
 				  handle_t *handle,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_extent_rec *insert_rec,
 				  struct ocfs2_insert_type *type)
 {
@@ -3674,13 +3760,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	u32 cpos;
 	struct ocfs2_path *right_path = NULL;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_list *el;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	el = &di->id2.i_list;
+	el = et->root_el;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
+	ret = ocfs2_journal_access(handle, inode, et->root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3692,7 +3776,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_inode_path(di_bh);
+	right_path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3742,7 +3826,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et->root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -3767,10 +3851,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_update_dinode_clusters(inode, di,
-					     le16_to_cpu(insert_rec->e_leaf_clusters));
+		ocfs2_update_clusters(inode, et,
+				      le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
+	ret = ocfs2_journal_dirty(handle, et->root_bh);
 	if (ret)
 		mlog_errno(ret);
 
@@ -3924,8 +4008,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
  * ocfs2_figure_appending_type() will figure out whether we'll have to
  * insert at the tail of the rightmost leaf.
  *
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
  * then the logic here makes sense.
  */
 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3976,14 +4060,13 @@ set_tail_append:
  * structure.
  */
 static int ocfs2_figure_insert_type(struct inode *inode,
-				    struct buffer_head *di_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **last_eb_bh,
 				    struct ocfs2_extent_rec *insert_rec,
 				    int *free_records,
 				    struct ocfs2_insert_type *insert)
 {
 	int ret;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
@@ -3991,7 +4074,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 
 	insert->ins_split = SPLIT_NONE;
 
-	el = &di->id2.i_list;
+	el = et->root_el;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
 	if (el->l_tree_depth) {
@@ -4002,7 +4085,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * may want it later.
 		 */
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk), &bh,
+				       et->eops->get_last_eb_blk(et), &bh,
 				       OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4029,7 +4112,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		return 0;
 	}
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4079,7 +4162,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * the case that we're doing a tail append, so maybe we can
 	 * take advantage of that information somehow.
 	 */
-	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+	if (et->eops->get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
 		/*
 		 * Ok, ocfs2_find_path() returned us the rightmost
 		 * tree path. This might be an appending insert. There are
@@ -4109,21 +4193,30 @@ out:
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct buffer_head *root_bh,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
-			struct ocfs2_alloc_context *meta_ac)
+			struct ocfs2_alloc_context *meta_ac,
+			enum ocfs2_extent_tree_type et_type)
 {
 	int status;
 	int uninitialized_var(free_records);
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
+	struct ocfs2_extent_tree *et = NULL;
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -4141,7 +4234,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 	rec.e_flags = flags;
 
-	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
 					  &free_records, &insert);
 	if (status < 0) {
 		mlog_errno(status);
@@ -4155,7 +4248,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	     free_records, insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
-		status = ocfs2_grow_tree(inode, handle, fe_bh,
+		status = ocfs2_grow_tree(inode, handle, et,
 					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
 		if (status) {
@@ -4165,16 +4258,18 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	}
 
 	/* Finally, we can add clusters. This might rotate the tree for us. */
-	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else
+	else if (et->type == OCFS2_DINODE_EXTENT)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
 	if (last_eb_bh)
 		brelse(last_eb_bh);
 
+	if (et)
+		ocfs2_free_extent_tree(et);
 	mlog_exit(status);
 	return status;
 }
@@ -4202,7 +4297,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
 static int ocfs2_split_and_insert(struct inode *inode,
 				  handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct buffer_head **last_eb_bh,
 				  int split_index,
 				  struct ocfs2_extent_rec *orig_split_rec,
@@ -4216,7 +4311,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
 	struct ocfs2_insert_type insert;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_dinode *di;
 
 leftright:
 	/*
@@ -4225,8 +4319,7 @@ leftright:
 	 */
 	rec = path_leaf_el(path)->l_recs[split_index];
 
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-	rightmost_el = &di->id2.i_list;
+	rightmost_el = et->root_el;
 
 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
 	if (depth) {
@@ -4237,8 +4330,8 @@ leftright:
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
-				      meta_ac);
+		ret = ocfs2_grow_tree(inode, handle, et,
+				      &depth, last_eb_bh, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4275,8 +4368,7 @@ leftright:
 		do_leftright = 1;
 	}
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
-				     &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4318,8 +4410,9 @@ out:
  * of the tree is required. All other cases will degrade into a less
  * optimal tree layout.
  *
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
  *
  * This code is optimized for readability - several passes might be
  * made over certain portions of the tree. All of those blocks will
@@ -4327,7 +4420,7 @@ out:
  * extra overhead is not expressed in terms of disk reads.
  */
 static int __ocfs2_mark_extent_written(struct inode *inode,
-				       struct buffer_head *di_bh,
+				       struct ocfs2_extent_tree *et,
 				       handle_t *handle,
 				       struct ocfs2_path *path,
 				       int split_index,
@@ -4367,10 +4460,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	 */
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
-		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
+				       et->eops->get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret) {
 			mlog_exit(ret);
@@ -4404,7 +4496,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		if (ctxt.c_split_covers_rec)
 			el->l_recs[split_index] = *split_rec;
 		else
-			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
 						     split_rec, meta_ac);
 		if (ret)
@@ -4412,7 +4504,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	} else {
 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
 						split_index, split_rec,
-						dealloc, &ctxt);
+						dealloc, &ctxt, et);
 		if (ret)
 			mlog_errno(ret);
 	}
@@ -4430,16 +4522,18 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc)
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      enum ocfs2_extent_tree_type et_type)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
 	struct ocfs2_extent_rec split_rec;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_tree *et = NULL;
 
 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
@@ -4453,13 +4547,21 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 		goto out;
 	}
 
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
 	 */
-	ocfs2_extent_map_trunc(inode, 0);
+	if (et_type == OCFS2_DINODE_EXTENT)
+		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_inode_path(di_bh);
+	left_path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4490,23 +4592,25 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
-					  index, &split_rec, meta_ac, dealloc);
+	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+					  index, &split_rec, meta_ac,
+					  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
 out:
 	ocfs2_free_path(left_path);
+	if (et)
+		ocfs2_free_extent_tree(et);
 	return ret;
 }
 
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 			    handle_t *handle, struct ocfs2_path *path,
 			    int index, u32 new_range,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, depth, credits = handle->h_buffer_credits;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *rightmost_el, *el;
@@ -4524,7 +4628,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	depth = path->p_tree_depth;
 	if (depth > 0) {
 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
+				       et->eops->get_last_eb_blk(et),
 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -4537,7 +4641,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 		rightmost_el = path_leaf_el(path);
 
 	credits += path->p_tree_depth +
-		   ocfs2_extend_meta_needed(&di->id2.i_list);
+		   ocfs2_extend_meta_needed(et->root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4546,7 +4650,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4560,7 +4664,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	insert.ins_split = SPLIT_RIGHT;
 	insert.ins_tree_depth = depth;
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4572,7 +4676,8 @@ out:
 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 			      struct ocfs2_path *path, int index,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      u32 cpos, u32 len)
+			      u32 cpos, u32 len,
+			      struct ocfs2_extent_tree *et)
 {
 	int ret;
 	u32 left_cpos, rec_range, trunc_range;
@@ -4584,7 +4689,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_block *eb;
 
 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4715,7 +4820,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 
 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
 
-	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4726,20 +4831,29 @@ out:
 	return ret;
 }
 
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc)
+			struct ocfs2_cached_dealloc_ctxt *dealloc,
+			enum ocfs2_extent_tree_type et_type)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_path *path;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_tree *et = NULL;
+
+	et = ocfs2_new_extent_tree(root_bh, et_type);
+	if (!et) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->root_bh, et->root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4792,13 +4906,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+		ret = ocfs2_split_tree(inode, et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4847,7 +4961,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4856,6 +4970,8 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 out:
 	ocfs2_free_path(path);
+	if (et)
+		ocfs2_free_extent_tree(et);
 	return ret;
 }
 
@@ -6364,7 +6480,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * the in-inode data from our pages.
 		 */
 		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
-					  0, block, 1, 0, NULL);
+					  0, block, 1, 0,
+					  NULL, OCFS2_DINODE_EXTENT);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -6406,13 +6523,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
 
 	mlog_entry_void();
 
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_inode_path(fe_bh);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 249e79e..5a460a9 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,28 +26,37 @@
 #ifndef OCFS2_ALLOC_H
 #define OCFS2_ALLOC_H
 
+enum ocfs2_extent_tree_type {
+	OCFS2_DINODE_EXTENT = 0,
+};
+
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct buffer_head *root_bh,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
-			struct ocfs2_alloc_context *meta_ac);
+			struct ocfs2_alloc_context *meta_ac,
+			enum ocfs2_extent_tree_type et_type);
 struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
-			      struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+			      struct ocfs2_cached_dealloc_ctxt *dealloc,
+			      enum ocfs2_extent_tree_type et_type);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
-			struct ocfs2_cached_dealloc_ctxt *dealloc);
+			struct ocfs2_cached_dealloc_ctxt *dealloc,
+			enum ocfs2_extent_tree_type et_type);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct buffer_head *bh);
+			   struct buffer_head *root_bh,
+			   enum ocfs2_extent_tree_type et_type);
+
 /*
  * how many new metadata chunks would an allocation need at maximum?
  *
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f83a2a4..009dcc4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1268,7 +1268,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	} else if (unwritten) {
 		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
 						wc->w_handle, cpos, 1, phys,
-						meta_ac, &wc->w_dealloc);
+						meta_ac, &wc->w_dealloc,
+						OCFS2_DINODE_EXTENT);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1702,7 +1703,13 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, wc->w_di_bh,
+		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+		     " clusters_to_add = %u, extents_to_split = %u\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+		     clusters_to_alloc, extents_to_split);
+
+		ret = ocfs2_lock_allocators(inode, wc->w_di_bh, &di->id2.i_list,
 					    clusters_to_alloc, extents_to_split,
 					    &data_ac, &meta_ac);
 		if (ret) {
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5e8cd6d..8c10158 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1307,7 +1307,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * related blocks have been journaled already.
 	 */
 	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL);
+				  NULL, OCFS2_DINODE_EXTENT);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1333,7 +1333,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
 		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL);
+					  len, 0, NULL, OCFS2_DINODE_EXTENT);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1476,7 +1476,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		num_free_extents = ocfs2_num_free_extents(osb, dir,
-							  parent_fe_bh);
+							  parent_fe_bh,
+							  OCFS2_DINODE_EXTENT);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 79d7da9..cc292ee 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -521,7 +521,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh);
+	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh,
+					      OCFS2_DINODE_EXTENT);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -570,7 +571,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     flags, meta_ac);
+				     flags, meta_ac, OCFS2_DINODE_EXTENT);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -599,92 +600,6 @@ leave:
 	return status;
 }
 
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *di_bh,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
-{
-	int ret = 0, num_free_extents;
-	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-
-	*meta_ac = NULL;
-	if (data_ac)
-		*data_ac = NULL;
-
-	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u, extents_to_split = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di_bh);
-	if (num_free_extents < 0) {
-		ret = num_free_extents;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/*
-	 * Sparse allocation file systems need to be more conservative
-	 * with reserving room for expansion - the actual allocation
-	 * happens while we've got a journal handle open so re-taking
-	 * a cluster lock (because we ran out of room for another
-	 * extent) will violate ordering rules.
-	 *
-	 * Most of the time we'll only be seeing this 1 cluster at a time
-	 * anyway.
-	 *
-	 * Always lock for any unwritten extents - we might want to
-	 * add blocks during a split.
-	 */
-	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, &di->id2.i_list, meta_ac);
-		if (ret < 0) {
-			if (ret != -ENOSPC)
-				mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	if (clusters_to_add == 0)
-		goto out;
-
-	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
-	if (ret < 0) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-
-out:
-	if (ret) {
-		if (*meta_ac) {
-			ocfs2_free_alloc_context(*meta_ac);
-			*meta_ac = NULL;
-		}
-
-		/*
-		 * We cannot have an error and a non null *data_ac.
-		 */
-	}
-
-	return ret;
-}
-
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 				     u32 clusters_to_add, int mark_unwritten)
 {
@@ -725,7 +640,13 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, bh, clusters_to_add, 0, &data_ac,
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+	     clusters_to_add);
+	status = ocfs2_lock_allocators(inode, bh, &fe->id2.i_list,
+				       clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -1397,7 +1318,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-	ret = ocfs2_lock_allocators(inode, di_bh, 0, 1, NULL, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, di_bh, &di->id2.i_list,
+				    0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1428,7 +1350,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
-				  dealloc);
+				  dealloc, OCFS2_DINODE_EXTENT);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e38ecb2..e090ff2 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -55,10 +55,6 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *fe,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index af769a5..1992a6a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1891,3 +1891,85 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
 	}
 }
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
+			  struct ocfs2_extent_list *root_el,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, root_bh,
+						  OCFS2_DINODE_EXTENT);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+		ret = ocfs2_reserve_new_metadata(osb, root_el, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_to_add == 0)
+		goto out;
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index d024c69..df19591 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -161,4 +161,9 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct buffer_head *root_bh,
+			  struct ocfs2_extent_list *root_el,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 #endif /* _CHAINALLOC_H_ */
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:33 UTC

head link

[Ocfs2-devel] [PATCH 4/8] Make extend allocation generic.v1

The old ocfs2_do_extend_allocation is restricted to file extension.
A new function named ocfs2_do_cluster_allocation, created in
suballoc.c, now handles generic extend allocation.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/file.c     |   94 +++-------------------------------------------
 fs/ocfs2/file.h     |    6 +--
 fs/ocfs2/suballoc.c |  103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/suballoc.h |   18 +++++++++
 4 files changed, 128 insertions(+), 93 deletions(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index cc292ee..69ef6ba 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -508,96 +508,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
-	int free_extents;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	enum ocfs2_alloc_restarted reason = RESTART_NONE;
-	u32 bit_off, num_bits;
-	u64 block;
-	u8 flags = 0;
-
-	BUG_ON(!clusters_to_add);
-
-	if (mark_unwritten)
-		flags = OCFS2_EXT_UNWRITTEN;
-
-	free_extents = ocfs2_num_free_extents(osb, inode, fe_bh,
-					      OCFS2_DINODE_EXTENT);
-	if (free_extents < 0) {
-		status = free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	/* there are two cases which could cause us to EAGAIN in the
-	 * we-need-more-metadata case:
-	 * 1) we haven't reserved *any*
-	 * 2) we are so fragmented, we've needed to add metadata too
-	 *    many times. */
-	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	} else if ((!free_extents)
-		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(&fe->id2.i_list))) {
-		mlog(0, "filesystem is really fragmented...\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	}
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 
-	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
-					clusters_to_add, &bit_off, &num_bits);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
-	BUG_ON(num_bits > clusters_to_add);
-
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
-				     *logical_offset, block, num_bits,
-				     flags, meta_ac, OCFS2_DINODE_EXTENT);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= num_bits;
-	*logical_offset += num_bits;
-
-	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, clusters = %u, wanted = "
-		     "%u\n", fe->i_clusters, clusters_to_add);
-		status = -EAGAIN;
-		reason = RESTART_TRANS;
-	}
-
-leave:
-	mlog_exit(status);
-	if (reason_ret)
-		*reason_ret = reason;
-	return status;
+	return ocfs2_do_cluster_allocation(osb, inode, logical_offset,
+					   &clusters_to_add, mark_unwritten,
+					   fe_bh, el, handle,
+					   data_ac, meta_ac, reason_ret,
+					   OCFS2_DINODE_EXTENT);
 }
 
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e090ff2..1ef2ac3 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -31,6 +31,7 @@ extern const struct file_operations ocfs2_dops;
 extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
 
 struct ocfs2_file_private {
 	struct file		*fp_file;
@@ -38,11 +39,6 @@ struct ocfs2_file_private {
 	struct ocfs2_lock_res	fp_flock;
 };
 
-enum ocfs2_alloc_restarted {
-	RESTART_NONE = 0,
-	RESTART_TRANS,
-	RESTART_META
-};
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 1992a6a..c953796 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1973,3 +1973,106 @@ out:
 
 	return ret;
 }
+
+int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 *clusters_to_add,
+				int mark_unwritten,
+				struct buffer_head *root_bh,
+				struct ocfs2_extent_list *root_el,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret,
+				enum ocfs2_extent_tree_type type)
+{
+	int status = 0;
+	int free_extents;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+	u8 flags = 0;
+
+	BUG_ON(!clusters_to_add);
+
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
+	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(root_el))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					*clusters_to_add, &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > *clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				     *logical_offset, block, num_bits,
+				     flags, meta_ac, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, root_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	*clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
+
+	if (*clusters_to_add) {
+		mlog(0, "need to alloc once more, wanted = %u\n",
+		     *clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
+	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
+	return status;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index df19591..fff02f9 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -166,4 +166,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct
buffer_head *root_bh,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
+
+enum ocfs2_alloc_restarted {	/* why an allocation pass returned -EAGAIN */
+	RESTART_NONE = 0,	/* no restart requested */
+	RESTART_TRANS,		/* clusters remain; extend the transaction */
+	RESTART_META		/* metadata reservation missing or too small */
+};
+int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,	/* in/out */
+				u32 *clusters_to_add,	/* in/out */
+				int mark_unwritten,
+				struct buffer_head *root_bh,
+				struct ocfs2_extent_list *root_el,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret,
+			       enum ocfs2_extent_tree_type type);
 #endif /* _CHAINALLOC_H_ */
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:33 UTC

head link

[Ocfs2-devel] [PATCH 5/8] Add xattr header in ocfs2.v1

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/xattr.h |   95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 95 insertions(+), 0 deletions(-)
 create mode 100644 fs/ocfs2/xattr.h

diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 0000000..a5b93cf
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,95 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define OCFS2_XATTR_BLOCK_SIGNATURE_1	"XATTR01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE_2	"XATTR02"
+
+enum ocfs2_xattr_type {
+	OCFS2_XATTR_INDEX_USER = 1,	/* numbering starts at 1, not 0 */
+	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	OCFS2_XATTR_INDEX_TRUSTED,
+	OCFS2_XATTR_INDEX_LUSTRE,
+	OCFS2_XATTR_INDEX_SECURITY,
+	OCFS2_XATTR_MAX			/* one past the last index above */
+};
+
+struct ocfs2_xattr_entry {
+	__le32	xe_name_hash;
+	__le16	xe_name_offset;
+	__u8	xe_name_len;
+	__u8	xe_type : 7;	/* NOTE(review): bit-field order within the */
+	__u8	xe_local : 1;	/* byte is implementation-defined; confirm. */
+	__le64	xe_value_size;
+};
+
+struct ocfs2_xattr_header {
+	__le16	xh_count;	/* presumably # of xh_entries; confirm */
+	__le16	xh_reserved1;
+	__le32	xh_csum;
+	__le16  xh_reserved2[4];
+	struct ocfs2_xattr_entry	xh_entries[0];	/* trailing array */
+};
+
+struct ocfs2_xattr_value_root {
+/*00*/	__le32	xr_clusters;	/* cluster count of this value's tree */
+	__le32	xr_reserved0;
+	__le64	xr_last_eb_blk;	/* accessed via the extent-tree ops */
+/*10*/	struct ocfs2_extent_list	xr_list;	/* tree root */
+};
+
+struct ocfs2_xattr_tree_root {
+/*00*/	__le32	xt_clusters;	/* presumably tree cluster count; confirm */
+	__le32	xt_reserved0;
+	__le64	xt_last_eb_blk;
+/*10*/	struct ocfs2_extent_list	xt_list;	/* tree root */
+};
+
+#define OCFS2_XATTR_INDEXED 0x1
+
+struct ocfs2_xattr_block {
+/*00*/	__u8	xb_signature[8];
+	__le16	xb_suballoc_slot;
+	__le16	xb_suballoc_bit;
+	__le32	xb_fs_generation;
+/*10*/	__le32	xb_csum;
+	__le16	xb_flags;	/* e.g. OCFS2_XATTR_INDEXED - TODO confirm */
+	__le16	xb_reserved0;
+	__le64	xb_blkno;
+/*20*/	__le64	xb_reserved1[2];
+/*30*/	union {
+		struct ocfs2_xattr_header	xb_header;
+		struct ocfs2_xattr_tree_root	xb_root;
+	} xb_attrs;	/* presumably selected by xb_flags; confirm */
+};
+
+#endif /* OCFS2_XATTR_H */
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:34 UTC

head link

[Ocfs2-devel] [PATCH 6/8] Add extent tree operation for xattr value.v1

When an xattr value is too large to be stored inline, we allocate clusters
for it, and ocfs2_extent_list and ocfs2_extent_rec are used to track them.
In order to re-use the b-tree operation code, a new parameter named "private"
is added to ocfs2_extent_tree; it is used to indicate the root of the
ocfs2_extent_list. The reason is that we can no longer deduce the root from
the buffer_head: it may be in an inode, in an ocfs2_xattr_block or, even
worse, anywhere in an ocfs2_xattr_bucket.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/Makefile     |    3 +-
 fs/ocfs2/alloc.c      |   72 ++++++++++--
 fs/ocfs2/alloc.h      |   13 ++-
 fs/ocfs2/aops.c       |    5 +-
 fs/ocfs2/dir.c        |    8 +-
 fs/ocfs2/extent_map.c |   60 ++++++++++
 fs/ocfs2/extent_map.h |    3 +
 fs/ocfs2/file.c       |    9 +-
 fs/ocfs2/suballoc.c   |   13 ++-
 fs/ocfs2/suballoc.h   |    6 +-
 fs/ocfs2/xattr.c      |  301 +++++++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 461 insertions(+), 32 deletions(-)
 create mode 100644 fs/ocfs2/xattr.c

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de..af63980 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
 	symlink.o 		\
 	sysfile.o 		\
 	uptodate.o		\
-	ver.o
+	ver.o			\
+	xattr.o			\
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 71a89b2..9aba2c1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -46,6 +46,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -60,6 +61,7 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations *eops;
 	struct buffer_head *root_bh;
 	struct ocfs2_extent_list *root_el;
+	void *private;
 };
 
 static void ocfs2_dinode_set_last_eb_blk(void *p, u64 blkno)
@@ -105,9 +107,39 @@ static struct ocfs2_extent_tree_operations
ocfs2_dinode_et_ops = {
 	.sanity_check		= ocfs2_dinode_sanity_check,
 };
 
+static void ocfs2_xattr_value_set_last_eb_blk(void *p, u64 blkno)
+{	/* p is the ocfs2_extent_tree; the value root is its ->private. */
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->private;
+
+	xv->xr_last_eb_blk = cpu_to_le64(blkno);	/* kept little-endian */
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(void *p)
+{	/* p is the ocfs2_extent_tree; the value root is its ->private. */
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->private;
+
+	return le64_to_cpu(xv->xr_last_eb_blk);	/* CPU-endian block no. */
+}
+
+static int ocfs2_xattr_value_sanity_check(struct inode *inode, void *p)
+{
+	return 0;	/* nothing is validated for xattr value roots */
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
+	.set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.sanity_check		= ocfs2_xattr_value_sanity_check,
+};	/* installed for OCFS2_XATTR_VALUE_EXTENT trees */
+
 static struct ocfs2_extent_tree*
 	 ocfs2_new_extent_tree(struct buffer_head *bh,
-			       enum ocfs2_extent_tree_type et_type)
+			       enum ocfs2_extent_tree_type et_type,
+			       void *private)
 {
 	struct ocfs2_extent_tree *et;
 
@@ -118,12 +150,16 @@ static struct ocfs2_extent_tree*
 	et->type = et_type;
 	get_bh(bh);
 	et->root_bh = bh;
+	et->private = private;
 
-	/* current we only support dinode extent. */
-	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
 	if (et_type == OCFS2_DINODE_EXTENT) {
 		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
 		et->eops = &ocfs2_dinode_et_ops;
+	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
+		struct ocfs2_xattr_value_root *xv +			(struct ocfs2_xattr_value_root *)
private;
+		et->root_el = &xv->xr_list;
+		et->eops = &ocfs2_xattr_et_ops;
 	}
 
 	return et;
@@ -446,7 +482,8 @@ struct ocfs2_merge_ctxt {
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type type)
+			   enum ocfs2_extent_tree_type type,
+			   void *private)
 {
 	int retval;
 	struct ocfs2_extent_list *el = NULL;
@@ -468,6 +505,12 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 		if (fe->i_last_eb_blk)
 			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
 		el = &fe->id2.i_list;
+	} else if (type == OCFS2_XATTR_VALUE_EXTENT) {
+		struct ocfs2_xattr_value_root *xv +			(struct ocfs2_xattr_value_root *)
private;
+
+		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
+		el = &xv->xr_list;
 	}
 
 	if (last_eb_blk) {
@@ -3450,6 +3493,10 @@ static void ocfs2_update_clusters(struct inode *inode,
 		struct ocfs2_dinode *di  			(struct ocfs2_dinode *)et->root_bh->b_data;
 		ocfs2_update_dinode_clusters(inode, di, clusters);
+	} else if (et->type == OCFS2_XATTR_VALUE_EXTENT) {
+		struct ocfs2_xattr_value_root *xv +			(struct ocfs2_xattr_value_root
*)et->private;
+		le32_add_cpu(&xv->xr_clusters, clusters);
 	}
 }
 
@@ -4199,7 +4246,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 new_clusters,
 			u8 flags,
 			struct ocfs2_alloc_context *meta_ac,
-			enum ocfs2_extent_tree_type et_type)
+			enum ocfs2_extent_tree_type et_type,
+			void *private)
 {
 	int status;
 	int uninitialized_var(free_records);
@@ -4210,7 +4258,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
+	et = ocfs2_new_extent_tree(root_bh, et_type, private);
 	if (!et) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -4526,7 +4574,8 @@ int ocfs2_mark_extent_written(struct inode *inode, struct
buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type)
+			      enum ocfs2_extent_tree_type et_type,
+			      void *private)
 {
 	int ret, index;
 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
@@ -4547,7 +4596,7 @@ int ocfs2_mark_extent_written(struct inode *inode, struct
buffer_head *root_bh,
 		goto out;
 	}
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
+	et = ocfs2_new_extent_tree(root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4835,7 +4884,8 @@ int ocfs2_remove_extent(struct inode *inode, struct
buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type)
+			enum ocfs2_extent_tree_type et_type,
+			void *private)
 {
 	int ret, index;
 	u32 rec_range, trunc_range;
@@ -4844,7 +4894,7 @@ int ocfs2_remove_extent(struct inode *inode, struct
buffer_head *root_bh,
 	struct ocfs2_path *path = NULL;
 	struct ocfs2_extent_tree *et = NULL;
 
-	et = ocfs2_new_extent_tree(root_bh, et_type);
+	et = ocfs2_new_extent_tree(root_bh, et_type, private);
 	if (!et) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -6481,7 +6531,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode
*inode,
 		 */
 		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
 					  0, block, 1, 0,
-					  NULL, OCFS2_DINODE_EXTENT);
+					  NULL, OCFS2_DINODE_EXTENT, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 5a460a9..b50ace5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -28,6 +28,7 @@
 
 enum ocfs2_extent_tree_type {
 	OCFS2_DINODE_EXTENT = 0,
+	OCFS2_XATTR_VALUE_EXTENT,
 };
 
 struct ocfs2_alloc_context;
@@ -40,22 +41,26 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 new_clusters,
 			u8 flags,
 			struct ocfs2_alloc_context *meta_ac,
-			enum ocfs2_extent_tree_type et_type);
+			enum ocfs2_extent_tree_type et_type,
+			void *private);
 struct ocfs2_cached_dealloc_ctxt;
 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      enum ocfs2_extent_tree_type et_type);
+			      enum ocfs2_extent_tree_type et_type,
+			      void *private);
 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc,
-			enum ocfs2_extent_tree_type et_type);
+			enum ocfs2_extent_tree_type et_type,
+			void *private);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *root_bh,
-			   enum ocfs2_extent_tree_type et_type);
+			   enum ocfs2_extent_tree_type et_type,
+			   void *private);
 
 /*
  * how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 009dcc4..b1b7750 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1269,7 +1269,7 @@ static int ocfs2_write_cluster(struct address_space
*mapping,
 		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
 						wc->w_handle, cpos, 1, phys,
 						meta_ac, &wc->w_dealloc,
-						OCFS2_DINODE_EXTENT);
+						OCFS2_DINODE_EXTENT, NULL);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -1711,7 +1711,8 @@ int ocfs2_write_begin_nolock(struct address_space
*mapping,
 
 		ret = ocfs2_lock_allocators(inode, wc->w_di_bh, &di->id2.i_list,
 					    clusters_to_alloc, extents_to_split,
-					    &data_ac, &meta_ac);
+					    &data_ac, &meta_ac,
+					    OCFS2_DINODE_EXTENT, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8c10158..b3e5cc5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1307,7 +1307,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir,
struct buffer_head *di_bh,
 	 * related blocks have been journaled already.
 	 */
 	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL, OCFS2_DINODE_EXTENT);
+				  NULL, OCFS2_DINODE_EXTENT, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1333,7 +1333,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir,
struct buffer_head *di_bh,
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
 		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL, OCFS2_DINODE_EXTENT);
+					  len, 0, NULL, OCFS2_DINODE_EXTENT,
+					  NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1477,7 +1478,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		num_free_extents = ocfs2_num_free_extents(osb, dir,
 							  parent_fe_bh,
-							  OCFS2_DINODE_EXTENT);
+							  OCFS2_DINODE_EXTENT,
+							  NULL);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a..619b20a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -373,6 +373,66 @@ out:
 	return ret;
 }
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el)
+{	/* Map logical cluster v_cluster in el's tree to a physical one. */
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
+
+	if (el->l_tree_depth) {	/* el is not a leaf list: find the leaf */
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;	/* search the leaf block's list */
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {	/* presumably: no record covers v_cluster */
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	} else {
+		rec = &el->l_recs[i];
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0) in xattr", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+		coff = v_cluster - le32_to_cpu(rec->e_cpos); /* offset in rec */
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
+		if (num_clusters)	/* optional out: clusters left in rec */
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	}
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;	/* 0, -EROFS, or the ocfs2_find_leaf() error */
+}
+
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e..d98444e 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32
*p_cluster,
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el);
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 69ef6ba..e138fec 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -515,7 +515,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 					   &clusters_to_add, mark_unwritten,
 					   fe_bh, el, handle,
 					   data_ac, meta_ac, reason_ret,
-					   OCFS2_DINODE_EXTENT);
+					   OCFS2_DINODE_EXTENT, NULL);
 }
 
 static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -565,7 +565,7 @@ restart_all:
 	     clusters_to_add);
 	status = ocfs2_lock_allocators(inode, bh, &fe->id2.i_list,
 				       clusters_to_add, 0, &data_ac,
-				       &meta_ac);
+				       &meta_ac, OCFS2_DINODE_EXTENT, NULL);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
@@ -1237,7 +1237,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	ret = ocfs2_lock_allocators(inode, di_bh, &di->id2.i_list,
-				    0, 1, NULL, &meta_ac);
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_DINODE_EXTENT, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1268,7 +1269,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
-				  dealloc, OCFS2_DINODE_EXTENT);
+				  dealloc, OCFS2_DINODE_EXTENT, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c953796..464870c 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1906,7 +1906,8 @@ int ocfs2_lock_allocators(struct inode *inode, struct
buffer_head *root_bh,
 			  struct ocfs2_extent_list *root_el,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
+			  struct ocfs2_alloc_context **meta_ac,
+			  enum ocfs2_extent_tree_type type, void *private)
 {
 	int ret = 0, num_free_extents;
 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
@@ -1919,7 +1920,7 @@ int ocfs2_lock_allocators(struct inode *inode, struct
buffer_head *root_bh,
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, root_bh,
-						  OCFS2_DINODE_EXTENT);
+						  type, private);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
@@ -1985,7 +1986,8 @@ int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret,
-				enum ocfs2_extent_tree_type type)
+				enum ocfs2_extent_tree_type type,
+				void *private)
 {
 	int status = 0;
 	int free_extents;
@@ -1999,7 +2001,8 @@ int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
 	if (mark_unwritten)
 		flags = OCFS2_EXT_UNWRITTEN;
 
-	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type);
+	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
+					      private);
 	if (free_extents < 0) {
 		status = free_extents;
 		mlog_errno(status);
@@ -2048,7 +2051,7 @@ int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
 				     *logical_offset, block, num_bits,
-				     flags, meta_ac, type);
+				     flags, meta_ac, type, private);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fff02f9..592fc83 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -165,7 +165,8 @@ int ocfs2_lock_allocators(struct inode *inode, struct
buffer_head *root_bh,
 			  struct ocfs2_extent_list *root_el,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
+			  struct ocfs2_alloc_context **meta_ac,
+			  enum ocfs2_extent_tree_type type, void *private);
 
 enum ocfs2_alloc_restarted {
 	RESTART_NONE = 0,
@@ -183,5 +184,6 @@ int ocfs2_do_cluster_allocation(struct ocfs2_super *osb,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret,
-			       enum ocfs2_extent_tree_type type);
+				enum ocfs2_extent_tree_type type,
+				void *private);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 0000000..c223ab0
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,301 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "xattr.h"
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+					 u32 clusters_to_add,
+					 struct buffer_head *xattr_bh,
+					 struct ocfs2_xattr_value_root *xv)
+{	/* Grow the extent tree rooted at xv by clusters_to_add clusters. */
+	int status = 0;
+	int restart_func = 0;
+	int credits = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_list *root_el = &xv->xr_list;
+	u32 logical_start = le32_to_cpu(xv->xr_clusters); /* __le32 field */
+
+	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+restart_all:
+
+	status = ocfs2_lock_allocators(inode, xattr_bh, root_el,
+				       clusters_to_add, 0, &data_ac,
+				       &meta_ac, OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;	/* keeps "leave" from committing it */
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	status = ocfs2_journal_access(handle, inode, xattr_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_do_cluster_allocation(osb,
+					    inode,
+					    &logical_start,
+					    &clusters_to_add,
+					    0,
+					    xattr_bh,
+					    root_el,
+					    handle,
+					    data_ac,
+					    meta_ac,
+					    &why,
+					    OCFS2_XATTR_VALUE_EXTENT,
+					    xv);
+	if ((status < 0) && (status != -EAGAIN)) { /* -EAGAIN: see "why" */
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, xattr_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;	/* acted on at "leave" */
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    root_el,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+leave:
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {	/* redo with new reservations */
+		restart_func = 0;
+		goto restart_all;
+	}
+
+	mlog_exit(status);
+	return status;	/* 0 on success, negative errno otherwise */
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{	/* Remove len clusters at cpos from xv's tree in one transaction. */
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, root_bh, &xv->xr_list,
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);	/* held until "out" */
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {	/* failure is reported as an ERR_PTR */
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;	/* the handle is open: commit it */
+	}
+
+	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
+				  dealloc, OCFS2_XATTR_VALUE_EXTENT, xv);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xv->xr_clusters, -len);	/* shrink the root's count */
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;	/* 0 on success, negative errno otherwise */
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+				   u32 old_clusters,
+				   u32 new_clusters,
+				   struct buffer_head *root_bh,
+				   struct ocfs2_xattr_value_root *xv)
+{	/* Drop clusters [new_clusters, old_clusters) from xv's tree. */
+	int ret = 0;
+	u32 trunc_len, cpos, phys_cpos, alloc_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (old_clusters <= new_clusters)	/* nothing to remove */
+		return 0;
+
+	cpos = new_clusters;
+	trunc_len = old_clusters - new_clusters;
+	while (trunc_len) {	/* one extent record (or part) per pass */
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+					       &alloc_size, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)	/* do not pass the range end */
+			alloc_size = trunc_len;
+
+		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+						 phys_cpos, alloc_size,
+						 &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+out:
+	/* Reached on success and on error alike. */
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      int len)
+{	/* Resize xv's allocation to hold len bytes, growing or shrinking. */
+	int ret;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+	if (new_clusters == old_clusters)	/* already the right size */
+		return 0;
+
+	if (new_clusters > old_clusters)
+		ret = ocfs2_xattr_extend_allocation(inode,
+						    new_clusters - old_clusters,
+						    root_bh, xv);
+	else
+		ret = ocfs2_xattr_shrink_size(inode,
+					      old_clusters, new_clusters,
+					      root_bh, xv);
+
+	return ret;	/* result of the extend or shrink helper */
+}
-- 
1.5.4.GIT

Tao Ma

2008-Jun-05 07:35 UTC

head link

[Ocfs2-devel] [PATCH 8/8] Add large numbers of extended attributes support for ocfs2.v1

Extended attributes have been added to ocfs2, but they can only be stored in the
inode and one extra block, so their number is very limited. This patch enables
ocfs2 to store large numbers of EAs.

The original design doc was written by Mark Fasheh, and it can be found at
http://oss.oracle.com/osswiki/OCFS2/DesignDocs/IndexedEATrees. This patch makes
only a few small modifications to it.

First, because the bucket size is 4K, a new field named xh_offset is added
to ocfs2_xattr_header to indicate the next valid name/value offset in a bucket.
It is used when we store a new EA name/value. With this field, we can find the
place more quickly and, what's more, we don't need to sort the name/values
every time just so that the last entry indicates the next unused space.
Consider that when the blocksize is 512, we might have to update 8 blocks for
one insertion if we sorted the name/values like the original in-inode xattr
code does. That would definitely be inefficient.

Because of the new xh_offset, another field named xh_name_value_len is also
added to ocfs2_xattr_header. It records the total length of all the name/values
in the bucket. We need this so that we can check it and defragment the bucket
if the bucket has become too fragmented.

So now the insertion works like this:
1. xattr_index_block_find: find the right bucket by the name_hash, say bucketA.
2. Check whether there is enough space in bucketA. If yes, insert directly
   and modify xh_offset and xh_name_value_len accordingly. If no, check
   xh_name_value_len to see whether we can store it by defragmenting the bucket.
   If yes, defragment it and go on with the insertion.
3. If defragmenting doesn't work, check whether there is a new empty bucket in
   the clusters within this extent record. If yes, init the new bucket and move
   all the buckets after bucketA one by one to the next bucket. Move half of the
   entries in bucketA to the next bucket and go on with the insertion.
4. If there is no new bucket, grow the extent tree. (This should be the same as
   Mark has described in the design doc.)

As for xattr deletion, we will delete an xattr bucket when all the xattrs in this
bucket are removed, and move all the buckets after it to the previous one. When
all the xattr buckets in an extent record are freed, this extent record is
freed from the ocfs2_xattr_tree.

Two more things. This patch is a bit long, so please be patient when reviewing
it. ;) I will divide it into several smaller ones next time. Also, some function
names may not be good enough; I may change them once I have collected all of
your advice. I hope you enjoy it.

Signed-off-by: Tao Ma <tao.ma at oracle.com>
---
 fs/ocfs2/alloc.c |   46 +
 fs/ocfs2/alloc.h |    1 +
 fs/ocfs2/xattr.c | 3017 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/xattr.h |    5 +-
 4 files changed, 3028 insertions(+), 41 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9aba2c1..e0ceda6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -136,6 +136,37 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
 	.sanity_check		= ocfs2_xattr_value_sanity_check,
 };
 
+/*
+ * Extent-tree callback for the xattr index tree: store @blkno
+ * (converted to little-endian) in xt_last_eb_blk of the tree root.
+ *
+ * @p is the struct ocfs2_extent_tree; its root_bh holds the xattr block
+ * whose xb_attrs.xb_root is updated.
+ */
+static void ocfs2_xattr_tree_set_last_eb_blk(void *p, u64 blkno)
+{
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)et->root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+/*
+ * Extent-tree callback for the xattr index tree: return xt_last_eb_blk
+ * of the tree root in CPU byte order.
+ *
+ * @p is the struct ocfs2_extent_tree; its root_bh holds the xattr block
+ * whose xb_attrs.xb_root is read.
+ */
+static u64 ocfs2_xattr_tree_get_last_eb_blk(void *p)
+{
+	struct ocfs2_extent_tree *et = (struct ocfs2_extent_tree *)p;
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)et->root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+/*
+ * Sanity-check callback for the xattr index tree.
+ * No checks are implemented here; it always returns 0.
+ */
+static int ocfs2_xattr_tree_sanity_check(struct inode *inode, void *p)
+{
+	return 0;
+}
+
+/*
+ * Extent tree operations used for trees of type OCFS2_XATTR_TREE_EXTENT,
+ * i.e. trees rooted in an xattr block's xb_attrs.xb_root.
+ */
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+	.set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.sanity_check		= ocfs2_xattr_tree_sanity_check,
+};
+
 static struct ocfs2_extent_tree*
 	 ocfs2_new_extent_tree(struct buffer_head *bh,
 			       enum ocfs2_extent_tree_type et_type,
@@ -160,6 +191,11 @@ static struct ocfs2_extent_tree*
 			(struct ocfs2_xattr_value_root *) private;
 		et->root_el = &xv->xr_list;
 		et->eops = &ocfs2_xattr_et_ops;
+	} else if (et_type == OCFS2_XATTR_TREE_EXTENT) {
+		struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)bh->b_data;
+		et->root_el = &xb->xb_attrs.xb_root.xt_list;
+		et->eops = &ocfs2_xattr_tree_et_ops;
 	}
 
 	return et;
@@ -511,6 +547,12 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 
 		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
 		el = &xv->xr_list;
+	} else if (type == OCFS2_XATTR_TREE_EXTENT) {
+		struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+
+		last_eb_blk = le64_to_cpu(xb->xb_attrs.xb_root.xt_last_eb_blk);
+		el = &xb->xb_attrs.xb_root.xt_list;
 	}
 
 	if (last_eb_blk) {
@@ -3497,6 +3539,10 @@ static void ocfs2_update_clusters(struct inode *inode,
 		struct ocfs2_xattr_value_root *xv =
 			(struct ocfs2_xattr_value_root *)et->private;
 		le32_add_cpu(&xv->xr_clusters, clusters);
+	} else if (et->type == OCFS2_XATTR_TREE_EXTENT) {
+		struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)et->root_bh->b_data;
+		le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 	}
 }
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index b50ace5..7587f0e 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -29,6 +29,7 @@
 enum ocfs2_extent_tree_type {
 	OCFS2_DINODE_EXTENT = 0,
 	OCFS2_XATTR_VALUE_EXTENT,
+	OCFS2_XATTR_TREE_EXTENT,
 };
 
 struct ocfs2_alloc_context;
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ed07448..5775e03 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -33,6 +33,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -48,6 +49,7 @@
 #include "suballoc.h"
 #include "uptodate.h"
 #include "buffer_head_io.h"
+#include "super.h"
 #include "xattr.h"
 
 
@@ -108,13 +110,36 @@ struct ocfs2_xattr_info {
 struct ocfs2_xattr_search {
 	struct buffer_head *inode_bh;
 	struct buffer_head *xattr_bh;
+	struct buffer_head *header_bh;
 	struct ocfs2_xattr_header *header;
 	void *base;
 	void *end;
 	struct ocfs2_xattr_entry *here;
+	int alloc_base;
 	int not_found;
 };
 
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_list_index_block(struct inode *inode,
+					struct ocfs2_xattr_tree_root *xt,
+					char *buffer,
+					size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs);
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh);
+
 static inline u32 ocfs2_blocks_per_cluster(struct super_block *sb)
 {
 	return 1 << (OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits);
@@ -493,22 +518,28 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 				  size_t buffer_size)
 {
 	struct buffer_head *blk_bh = NULL;
-	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_xattr_block *xb;
 	int ret = 0;
 
 	if (!di->i_xattr_loc)
 		return ret;
-	else {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_xattr_loc),
-				       &blk_bh, OCFS2_BH_CACHED, inode);
-		if (ret)
-			return ret;
-	}
 
-	header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
-		 xb_attrs.xb_header;
-	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		return ret;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		ret = ocfs2_xattr_list_entries(inode, header,
+					       buffer, buffer_size);
+	} else {
+		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+		ret = ocfs2_xattr_list_index_block(inode, xt,
+						   buffer, buffer_size);
+	}
 
 	if (blk_bh)
 		brelse(blk_bh);
@@ -689,28 +720,35 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
 	size_t size;
 	int ret = -ENODATA;
 
 	if (!di->i_xattr_loc)
 		return ret;
-	else {
-		struct ocfs2_xattr_block *xb;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_xattr_loc),
-				       &blk_bh, OCFS2_BH_CACHED, inode);
-		if (ret)
-			goto cleanup;
-		xs->xattr_bh = blk_bh;
-		xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		goto cleanup;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	xs->xattr_bh = blk_bh;
+
+	if (!xb || !(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
 		xs->base = (void *)xs->header;
 		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
 		xs->here = xs->header->xh_entries;
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else {
+		xs->header_bh = NULL;
+		xs->alloc_base = 0;
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
 	}
 
-	ret = ocfs2_xattr_find_entry(name_index, name, xs);
 	if (ret)
 		goto cleanup;
 	size = le64_to_cpu(xs->here->xe_value_size);
@@ -731,8 +769,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	}
 	ret = size;
 cleanup:
-	if (blk_bh)
-		brelse(blk_bh);
+	if (xs->header_bh)
+		brelse(xs->header_bh);
+	if (xs->alloc_base)
+		kfree(xs->base);
+	if (xs->xattr_bh)
+		brelse(xs->xattr_bh);
 	return ret;
 }
 
@@ -1255,13 +1297,14 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 				    struct buffer_head *blk_bh)
 {
 	struct ocfs2_xattr_block *xb;
-	struct ocfs2_xattr_header *header;
 	int ret = 0;
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	header = &(xb->xb_attrs.xb_header);
-
-	ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	} else
+		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
 
 	return ret;
 }
@@ -1419,15 +1462,23 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
 	int ret = 0;
 
-	if (di->i_xattr_loc) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_xattr_loc),
-				       &blk_bh, OCFS2_BH_CACHED, inode);
-		if (ret)
-			return ret;
-		xs->xattr_bh = blk_bh;
+	if (di->i_xattr_loc == 0)
+		return 0;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       le64_to_cpu(di->i_xattr_loc),
+			       &blk_bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		return ret;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	xs->xattr_bh = blk_bh;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
 		xs->header = &((struct ocfs2_xattr_block *)blk_bh->b_data)->
 				xb_attrs.xb_header;
 		xs->base = (void *)xs->header;
@@ -1435,17 +1486,62 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 		xs->here = xs->header->xh_entries;
 
 		ret = ocfs2_xattr_find_entry(name_index, name, xs);
-		if (ret && ret != -ENODATA)
-			goto cleanup;
-		xs->not_found = ret;
-		return 0;
-	}
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret && ret != -ENODATA)
+		goto cleanup;
+	xs->not_found = ret;
+	return 0;
+
 cleanup:
 	if (blk_bh)
 		brelse(blk_bh);
 	return ret;
 }
 
+/*
+ * Turn an indexed xattr block whose extent list is empty back into a
+ * non-indexed xattr block.
+ *
+ * Inside a one-credit transaction the first
+ * sizeof(struct ocfs2_xattr_header) bytes of xb_attrs are zeroed and
+ * OCFS2_XATTR_INDEXED is cleared from xb_flags of xs->xattr_bh.
+ *
+ * BUGs if the block is not indexed or its extent list still has records.
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+		le16_to_cpu(el->l_next_free_rec) != 0);
+
+	handle = ocfs2_start_trans(osb, 1);
+	if (IS_ERR(handle)) {
+		/* No transaction was started, so there is nothing to commit. */
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memset(&xb->xb_attrs, 0, sizeof(struct ocfs2_xattr_header));
+
+	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+	ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
 static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
 				 struct ocfs2_xattr_search *xs)
@@ -1527,8 +1623,24 @@ out:
 			ocfs2_free_alloc_context(meta_ac);
 		if (ret < 0)
 			return ret;
+	} else
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		ret = ocfs2_xattr_set_entry(inode, xi, xs);
+		if (!ret || ret != -ENOSPC)
+			goto end;
+
+		ret = ocfs2_xattr_create_index_block(inode, xs);
+		if (ret)
+			goto end;
 	}
-	ret = ocfs2_xattr_set_entry(inode, xi, xs);
+
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+		ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
 	if (!ret && !(le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_XATTR_FL))
 		ocfs2_xattr_update_flag(inode,
 					xs->inode_bh,
@@ -1626,6 +1738,2831 @@ cleanup:
 		brelse(di_bh);
 	if (xbs.xattr_bh)
 		brelse(xbs.xattr_bh);
+	if (xbs.alloc_base)
+		kfree(xbs.base);
+	if (xbs.header_bh)
+		brelse(xbs.header_bh);
+	return ret;
+}
+
+/* Size in bytes of one xattr bucket. */
+#define OCFS2_XATTR_BUCKET_SIZE			4096
+/* NOTE(review): presumably BUCKET_SIZE / smallest block size (512) - confirm. */
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET 	8
+
+/* Number of OCFS2_XATTR_BUCKET_SIZE buckets that fit into one cluster. */
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	int cluster_bytes = 1 << osb->s_clustersize_bits;
+
+	return cluster_bytes / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+/* Number of filesystem blocks that make up one xattr bucket. */
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	int block_bytes = 1 << sb->s_blocksize_bits;
+
+	return OCFS2_XATTR_BUCKET_SIZE / block_bytes;
+}
+
+/*
+ * Number of extent records that fit in an xattr block after the
+ * fields that precede xb_attrs.xb_root.xt_list.l_recs.
+ */
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+	int avail = sb->s_blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return avail / sizeof(struct ocfs2_extent_rec);
+}
+
+/*
+ * Hash an xattr name given as (name_index, suffix): the prefix of the
+ * handler registered for @name_index and @suffix_name are both passed
+ * to ocfs2_xattr_name_hash().
+ */
+static inline u32 ocfs2_xattr_hash_by_name(int name_index,
+					   const char *suffix_name)
+{
+	struct xattr_handler *h = ocfs2_xattr_handler(name_index);
+	char *pfx = h->prefix;
+	int pfx_len = strlen(h->prefix);
+	int suffix_len = strlen(suffix_name);
+
+	return ocfs2_xattr_name_hash(pfx, pfx_len, (char *)suffix_name,
+				     suffix_len);
+}
+/*
+ * Find the xattr extent rec which may contain name_hash.
+ *
+ * el must be the extent list of the xattr tree root, i.e.
+ * ocfs2_xattr_block.xb_attrs.xb_root.xt_list. If the tree has depth,
+ * the leaf covering name_hash is looked up first. The records are then
+ * scanned from the last one backwards and the first rec whose e_cpos is
+ * not larger than name_hash is chosen.
+ *
+ * On success *p_blkno and *num_clusters describe the chosen rec and, if
+ * e_cpos is not NULL, *e_cpos is the first name hash of that rec.
+ *
+ * Returns 0 on success or a negative errno (-EROFS after ocfs2_error()
+ * when the tree looks corrupted).
+ */
+static int ocfs2_xattr_get_bucket(struct inode *inode,
+				  u32 name_hash,
+				  u64 *p_blkno,
+				  u32 *e_cpos,
+				  u32 *num_clusters,
+				  struct ocfs2_extent_list *el)
+{
+	/* ret must start at 0: a depth-0 tree never assigns it otherwise. */
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	u64 e_blkno = 0;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+			e_blkno = le64_to_cpu(rec->e_blkno);
+			break;
+		}
+	}
+
+	if (!e_blkno) {
+		/* rec is still NULL when the list has no records at all. */
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in xattr", inode->i_ino,
+			    rec ? le32_to_cpu(rec->e_cpos) : 0,
+			    rec ? ocfs2_rec_clusters(el, rec) : 0);
+		ret = -EROFS;
+		goto out;
+	}
+
+	*p_blkno = le64_to_cpu(rec->e_blkno);
+	*num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	if (e_cpos)
+		*e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;
+}
+
+/*
+ * Get the xattr entry at index offset in a bucket whose first block is
+ * header_bh.
+ *
+ * *bh is set to the block which contains this entry. If the entry is in
+ * the same block as the header, it is set to NULL. A non-NULL *bh is
+ * expected to be released by the caller with brelse().
+ * Please note that the whole xattr entry will always be in the same block.
+ *
+ * Returns NULL if offset is not below xh_count or if reading the block
+ * fails.
+ */
+static struct ocfs2_xattr_entry*
+	ocfs2_get_xe_in_bucket(struct inode *inode,
+			       struct buffer_head *header_bh,
+			       struct buffer_head **bh,
+			       u16 offset)
+{
+	int ret;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 xe_count = le16_to_cpu(xh->xh_count);
+	u16 xe_off, block_off;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	u64 start_blkno = header_bh->b_blocknr;
+
+	*bh = NULL;
+
+	if (offset >= xe_count)
+		return NULL;
+
+	/* Byte offset of the entry from the start of the bucket. */
+	xe_off = sizeof(struct ocfs2_xattr_header) +
+			offset * sizeof(struct ocfs2_xattr_entry);
+	block_off = xe_off / blocksize;
+
+	if (block_off == 0)
+		xe = &xh->xh_entries[offset];
+	else {
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       start_blkno + block_off,
+				       bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xe_off = xe_off % blocksize;
+		xe = (struct ocfs2_xattr_entry *)((*bh)->b_data + xe_off);
+	}
+
+out:
+	return xe;
+}
+
+/*
+ * Get a range of bytes in the bucket starting at header_bh.
+ * If store is not NULL, copy the bytes to store; with a NULL store the
+ * blocks are only read.
+ *
+ * The range [start_offset, start_offset + len) must lie inside the
+ * bucket, otherwise -EINVAL is returned.
+ *
+ * Returns the number of bytes covered (len) or a negative errno.
+ */
+static int ocfs2_get_range_in_bucket(struct inode *inode,
+				     struct buffer_head *header_bh,
+				     u16 start_offset,
+				     u16 len,
+				     char *store)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	u16 read_len = 0, read, offset;
+	u16 block_off;
+	int blocksize = inode->i_sb->s_blocksize;
+	u64 start_blkno = header_bh->b_blocknr;
+
+	if (start_offset >= OCFS2_XATTR_BUCKET_SIZE ||
+	    start_offset + len > OCFS2_XATTR_BUCKET_SIZE)
+		return -EINVAL;
+
+	while (len > 0) {
+		block_off = start_offset / blocksize;
+		offset = start_offset % blocksize;
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       start_blkno + block_off,
+				       &bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Take what is left in this block, but never more than len. */
+		read = (blocksize - offset) <= len ?
+				 (blocksize - offset) : len;
+		if (store)
+			memcpy(store + read_len, bh->b_data + offset, read);
+		read_len += read;
+		start_offset += read;
+		len -= read;
+		brelse(bh);
+		bh = NULL;
+	}
+	ret = read_len;
+
+out:
+	return ret;
+}
+
+/*
+ * Copy the name of xe out of the bucket starting at header_bh into
+ * xe_name, which must have room for xe->xe_name_len bytes.
+ * Returns whatever ocfs2_get_range_in_bucket() returns.
+ */
+static inline int ocfs2_get_xe_name_in_bucket(struct inode *inode,
+					      struct buffer_head *header_bh,
+					      struct ocfs2_xattr_entry *xe,
+					      char *xe_name)
+{
+	u16 name_off = le16_to_cpu(xe->xe_name_offset);
+	u16 name_bytes = xe->xe_name_len;
+
+	return ocfs2_get_range_in_bucket(inode, header_bh, name_off,
+					 name_bytes, xe_name);
+}
+
+/*
+ * Search one bucket (first block header_bh) for the xattr identified by
+ * name_index/name, whose hash is name_hash.
+ *
+ * Entries are visited in index order. Entries with a smaller hash are
+ * skipped and the scan stops at the first entry with a larger hash.
+ * For equal hashes the type, name length and finally the name bytes
+ * are compared.
+ *
+ * On a match *xe_index is set to the entry index and *found to 1.
+ * Returns 0 on a match, -ENODATA if there is none, or another negative
+ * errno on failure.
+ */
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+				   struct buffer_head *header_bh,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u16 *xe_index,
+				   int *found)
+{
+	int ret = 0, cmp = 1;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t name_len = strlen(name);
+	u16 i, xe_count = le16_to_cpu(xh->xh_count);
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct buffer_head *xe_bh = NULL;
+	char *xe_name = NULL;
+
+	/*
+	 * We don't use binary search in the bucket because there
+	 * may be multiple entries with the same name hash.
+	 */
+	for (i = 0; i < xe_count; i++) {
+		xe = ocfs2_get_xe_in_bucket(inode, header_bh, &xe_bh, i);
+		if (!xe) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash))
+			goto next;
+		else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+			break;
+
+		cmp = name_index - xe->xe_type;
+		if (!cmp)
+			cmp = name_len - xe->xe_name_len;
+		if (cmp)
+			goto next;
+
+		/* now we have to compare the xattr name. */
+		xe_name = kzalloc(name_len, GFP_NOFS);
+		if (!xe_name) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_get_xe_name_in_bucket(inode, header_bh,
+						  xe, xe_name);
+		if (ret != name_len) {
+			kfree(xe_name);
+			goto out;
+		}
+
+		cmp = memcmp(name, xe_name, name_len);
+		kfree(xe_name);
+		if (cmp == 0) {
+			*xe_index = i;
+			*found = 1;
+			break;
+		}
+next:
+		brelse(xe_bh);
+		xe_bh = NULL;
+	}
+
+	ret = cmp ? -ENODATA : 0;
+out:
+	/*
+	 * Every exit (match, early stop, error) comes through here, so
+	 * the entry block that is still held is dropped exactly once.
+	 * xe_bh is NULL when the entry lived in the header block.
+	 */
+	brelse(xe_bh);
+	return ret;
+}
+
+/*
+ * Fill bhs[] with the buffer heads of the bucket starting at blkno.
+ *
+ * With new == 0 the blocks are read through ocfs2_read_blocks().
+ * Otherwise each block is obtained with sb_getblk() and marked as a new
+ * up-to-date buffer without reading it; the first sb_getblk() failure
+ * stops the loop with -EIO and leaves the earlier entries of bhs[] set.
+ */
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+				   u64 blkno,
+				   struct buffer_head **bhs,
+				   int new)
+{
+	u16 idx, nr_blocks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int status = 0;
+
+	if (new) {
+		for (idx = 0; idx < nr_blocks; idx++) {
+			bhs[idx] = sb_getblk(inode->i_sb, blkno + idx);
+			if (bhs[idx] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				break;
+			}
+			ocfs2_set_new_buffer_uptodate(inode, bhs[idx]);
+		}
+
+		return status;
+	}
+
+	return ocfs2_read_blocks(OCFS2_SB(inode->i_sb), blkno,
+				 nr_blocks, bhs,
+				 OCFS2_BH_CACHED, inode);
+}
+
+/*
+ * Read the bucket starting at blkno and copy its blocks, in order, into
+ * buffer. buffer must have room for all blocks of one bucket
+ * (blocks per bucket * blocksize bytes).
+ *
+ * Returns 0 on success, -ENOMEM if the temporary buffer_head array
+ * cannot be allocated, or the error from reading the bucket.
+ */
+static int ocfs2_cp_xattr_bucket_to_buffer(struct inode *inode,
+					   u64 blkno,
+					   char *buffer)
+{
+	int i, ret, block_num = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head **bhs = NULL;
+	char *target;
+
+	bhs = kcalloc(block_num, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bhs) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, blkno, bhs, 0);
+	if (ret)
+		goto out;
+
+	target = buffer;
+	for (i = 0; i < block_num; i++, target += blocksize)
+		memcpy(target, bhs[i]->b_data, blocksize);
+
+out:
+	/* Entries that were never filled are NULL and skipped by brelse(). */
+	for (i = 0; i < block_num; i++)
+		brelse(bhs[i]);
+	kfree(bhs);
+	return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series starts from p_blkno. The
+ * ocfs2_xattr_header.xh_reserved1 of the first bucket contains
+ * the valid num of the buckets.
+ *
+ * The buckets are binary-searched by comparing name_hash with the hash
+ * of the first and the last entry of each bucket; the bucket whose
+ * range covers name_hash is then searched entry by entry.
+ *
+ * Unless an error occurs before that point, xs->header_bh/xs->header
+ * are set to the last bucket examined (the buffer reference is handed
+ * over to xs) so that the set function can use it to find the insert
+ * place. On a match xs->base, xs->end and xs->here are set as well;
+ * when the block size is smaller than the bucket size, xs->base is a
+ * kmalloc'ed copy of the bucket and xs->alloc_base is set to 1.
+ *
+ * first_hash and num_clusters are not used by this function.
+ *
+ * Returns 0 if found, -ENODATA if not, or another negative errno.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u64 p_blkno,
+				   u32 first_hash,
+				   u32 num_clusters,
+				   struct ocfs2_xattr_search *xs)
+{
+	int ret, found = 0;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *last_bh = NULL;
+	struct ocfs2_xattr_header *xh = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 xh_count, xe_index = 0;
+	u16 block_in_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int low_bucket = 0, bucket, high_bucket;
+	int blocksize = inode->i_sb->s_blocksize;
+	u32 last_hash;
+	u64 blkno;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
+			       &bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		goto out;
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	high_bucket = le16_to_cpu(xh->xh_reserved1) - 1;
+
+	while (low_bucket <= high_bucket) {
+		brelse(bh);
+		bh = last_bh = NULL;
+		bucket = (low_bucket + high_bucket) / 2;
+
+		blkno = p_blkno + bucket * block_in_bucket;
+
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+				       &bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xe = &xh->xh_entries[0];
+		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+			high_bucket = bucket - 1;
+			continue;
+		}
+
+		/*
+		 * Check whether the hash of the last entry in our
+		 * bucket is larger than the search one.
+		 */
+		xh_count = le16_to_cpu(xh->xh_count);
+		xe = ocfs2_get_xe_in_bucket(inode, bh, &last_bh,
+					    xh_count - 1);
+		if (!xe) {
+			ret = -EIO;
+			goto out;
+		}
+
+		/*
+		 * xe may point into last_bh, so take the hash before the
+		 * buffer is released and never touch xe afterwards.
+		 */
+		last_hash = le32_to_cpu(xe->xe_name_hash);
+		brelse(last_bh);
+		last_bh = NULL;
+		if (name_hash > last_hash) {
+			low_bucket = bucket + 1;
+			continue;
+		}
+
+		/* the searched xattr should reside in this bucket if exists. */
+		ret = ocfs2_find_xe_in_bucket(inode, bh,
+					      name_index, name, name_hash,
+					      &xe_index, &found);
+		break;
+	}
+
+	/* A real failure must not be reported as "not found". */
+	if (ret && ret != -ENODATA) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Record the bucket we have found.
+	 * Here the "header" is initialized first as the bh->b_data so that
+	 * the set function can use it to find the insert place.
+	 */
+	xs->header_bh = bh;
+	xs->header = (struct ocfs2_xattr_header *)xs->header_bh->b_data;
+	bh = NULL;
+	xs->base = NULL;
+
+	/* alloc bucket and get the xattr attribute here. */
+	if (found) {
+		if (blocksize < OCFS2_XATTR_BUCKET_SIZE) {
+			xs->base = kmalloc(OCFS2_XATTR_BUCKET_SIZE,  GFP_NOFS);
+			if (!xs->base) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+			xs->alloc_base = 1;
+			ret = ocfs2_cp_xattr_bucket_to_buffer(inode,
+						xs->header_bh->b_blocknr,
+						xs->base);
+			if (ret)
+				goto out;
+		} else
+			xs->base = xs->header_bh->b_data;
+
+		xs->end = xs->base + OCFS2_XATTR_BUCKET_SIZE;
+		xs->here = &((struct ocfs2_xattr_header *)xs->base)->
+							xh_entries[xe_index];
+		mlog(0, "find xattr in bucket %llu, index = %u\n",
+		     (unsigned long long)xs->header_bh->b_blocknr, xe_index);
+	} else
+		ret = -ENODATA;
+
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Look up the xattr name_index/name in the indexed xattr block root_bh.
+ *
+ * The name is hashed, the extent record that may contain the hash is
+ * located with ocfs2_xattr_get_bucket() and the buckets of that record
+ * are searched with ocfs2_xattr_bucket_find(), which fills in xs.
+ *
+ * Returns 0 if found, -ENODATA if the tree has no records or the name
+ * is not present, or another negative errno.
+ */
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u64 p_blkno = 0;
+	u32 first_hash, num_clusters = 0;
+	u32 name_hash = ocfs2_xattr_hash_by_name(name_index, name);
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return -ENODATA;
+
+	mlog(0, "find xattr %s, hash = %u, index = %d in index block\n",
+	     name, name_hash, name_index);
+
+	ret = ocfs2_xattr_get_bucket(inode, name_hash, &p_blkno, &first_hash,
+				     &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+	     "in the rec is %u\n", num_clusters,
+	     (unsigned long long)p_blkno, first_hash);
+
+	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+				      p_blkno, first_hash, num_clusters, xs);
+
+out:
+	return ret;
+}
+
+/*
+ * Cursor handed to ocfs2_list_xattr_bucket() while listing the xattrs
+ * of an index block, one bucket at a time.
+ */
+struct ocfs2_xattr_list {
+	char *buffer;		/* where the next names go; may be NULL */
+	size_t buffer_size;	/* reduced by each bucket's listing result */
+};
+
+/*
+ * Bucket callback for ocfs2_iterate_xattr_buckets(): list the entries
+ * of the bucket header xh into the cursor para (a struct
+ * ocfs2_xattr_list) and advance the cursor by the listed length.
+ *
+ * Returns 0 on success or the negative result of
+ * ocfs2_xattr_list_entries().
+ */
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+				   struct buffer_head *header_bh,
+				   struct ocfs2_xattr_header *xh,
+				   void *para)
+{
+	struct ocfs2_xattr_list *cursor = (struct ocfs2_xattr_list *)para;
+	int listed;
+
+	listed = ocfs2_xattr_list_entries(inode, xh,
+					  cursor->buffer, cursor->buffer_size);
+	if (listed < 0) {
+		mlog_errno(listed);
+		return listed;
+	}
+
+	/* The size is reduced even when there is no buffer to advance. */
+	if (cursor->buffer)
+		cursor->buffer += listed;
+	cursor->buffer_size -= listed;
+
+	return 0;
+}
+
+/*
+ * Call func for every valid bucket in the run of clusters that starts
+ * at block blkno.
+ *
+ * The loop bound starts as clusters * buckets-per-cluster and is
+ * replaced by xh_reserved1 of the first bucket once that bucket has
+ * been read. When a bucket spans several blocks they are copied into
+ * one contiguous buffer so that func sees the bucket as a single
+ * ocfs2_xattr_header; otherwise func gets the block data directly.
+ * func also receives the buffer head of the bucket's first block and
+ * para unchanged. func may be NULL, in which case the buckets are only
+ * read.
+ *
+ * Returns 0, -ENOMEM, a read error, or the first non-zero value
+ * returned by func (which stops the iteration).
+ */
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+				       u64 blkno,
+				       u32 clusters,
+				       int (*func)(struct inode *inode,
+						struct buffer_head *header_bh,
+						struct ocfs2_xattr_header *xh,
+						void *para),
+				       void *para)
+{
+	int i, j, ret = 0, alloc_bucket = 0;
+	char *bucket = NULL, *buf;
+	struct ocfs2_xattr_header *xh;
+	int block_num = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+	u32 bucket_num = clusters * bpc;
+	struct buffer_head **bhs = NULL;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+	     clusters, (unsigned long long)blkno);
+
+	bhs = kcalloc(block_num, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	if (block_num > 1) {
+		bucket = kmalloc(OCFS2_XATTR_BUCKET_SIZE,  GFP_NOFS);
+		if (!bucket) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		alloc_bucket = 1;
+	}
+
+	for (i = 0; i < bucket_num; i++, blkno += block_num) {
+		ret = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), blkno, block_num,
+					bhs, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (block_num > 1) {
+			buf = bucket;
+			for (j = 0; j < block_num; j++, buf += blocksize)
+				memcpy(buf, bhs[j]->b_data, blocksize);
+		} else
+			bucket = bhs[0]->b_data;
+
+		xh = (struct ocfs2_xattr_header *)bucket;
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			bucket_num = le16_to_cpu(xh->xh_reserved1);
+
+		mlog(0, "iterating xattr bucket %llu\n",
+		     (unsigned long long)blkno);
+		if (func) {
+			ret = func(inode, bhs[0], xh, para);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		for (j = 0; j < block_num; j++) {
+			brelse(bhs[j]);
+			bhs[j] = NULL;
+		}
+	}
+
+out:
+	/* Slots already released above are NULL and skipped by brelse(). */
+	for (j = 0; j < block_num; j++)
+		brelse(bhs[j]);
+	kfree(bhs);
+
+	if (alloc_bucket)
+		kfree(bucket);
+
+	return ret;
+}
+
+/*
+ * List the xattrs stored in the index tree xt into buffer.
+ *
+ * Extent records are visited from the highest hash downwards: each
+ * round looks up the record covering name_hash, lists all its buckets
+ * and continues with the hash just below that record's e_cpos.
+ *
+ * Returns buffer_size minus the space left after listing, 0 when the
+ * tree has no records, or a negative errno.
+ *
+ * NOTE(review): the loop ends when name_hash reaches 0, so a record
+ * with e_cpos == 1 would stop the walk before the record at hash 0 is
+ * visited - confirm whether such a layout can occur.
+ */
+static int ocfs2_xattr_list_index_block(struct inode *inode,
+					struct ocfs2_xattr_tree_root *xt,
+					char *buffer,
+					size_t buffer_size)
+{
+	struct ocfs2_extent_list *el = &xt->xt_list;
+	struct ocfs2_xattr_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+	};
+	u32 name_hash, e_cpos, num_clusters;
+	u64 p_blkno;
+	int ret = 0;
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	for (name_hash = UINT_MAX; name_hash > 0; name_hash = e_cpos - 1) {
+		ret = ocfs2_xattr_get_bucket(inode, name_hash, &p_blkno,
+					     &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_list_xattr_bucket,
+						  &xl);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+
+		/* The record starting at hash 0 is the last one to list. */
+		if (e_cpos == 0)
+			break;
+	}
+
+	ret = buffer_size - xl.buffer_size;
+	return ret;
+}
+
+/*
+ * sort() comparison callback: order two ocfs2_xattr_entry objects by
+ * ascending xe_name_hash (compared in CPU byte order).
+ */
+static int cmp_xe(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *lhs = a;
+	const struct ocfs2_xattr_entry *rhs = b;
+	u32 lhs_hash = le32_to_cpu(lhs->xe_name_hash);
+	u32 rhs_hash = le32_to_cpu(rhs->xe_name_hash);
+
+	if (lhs_hash == rhs_hash)
+		return 0;
+
+	return lhs_hash > rhs_hash ? 1 : -1;
+}
+
+/*
+ * sort() swap callback: exchange two ocfs2_xattr_entry objects.
+ * size is ignored; one whole entry is always swapped.
+ */
+static void swap_xe(void *a, void *b, int size)
+{
+	struct ocfs2_xattr_entry *first = a;
+	struct ocfs2_xattr_entry *second = b;
+	struct ocfs2_xattr_entry saved = *first;
+
+	*first = *second;
+	*second = saved;
+}
+
+/*
+ * Copy the xattrs of the non-indexed xattr block xb_bh into a new
+ * bucket.
+ *
+ * xh_bh is the first block of the bucket and receives the header and
+ * the entries. The name/value area is written into data_bh if that is
+ * not NULL and into xh_bh otherwise. The name/value bytes keep their
+ * offset within the block, so every xe_name_offset is shifted to
+ * account for the new position. Finally the entries are sorted by name
+ * hash.
+ *
+ * NOTE(review): the last entry is read as xh_entries[count - 1], so
+ * this presumably relies on the source block holding at least one
+ * entry - confirm against the caller.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+					   struct buffer_head *xb_bh,
+					   struct buffer_head *xh_bh,
+					   struct buffer_head *data_bh)
+{
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 offset, size, off_change;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	u16 count = le16_to_cpu(xb_xh->xh_count);
+	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+	mlog(0, "cp xattr from block %llu to bucket %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr,
+	     (unsigned long long)xh_bh->b_blocknr);
+
+	xh->xh_count = xb_xh->xh_count;
+	xh->xh_reserved1 = cpu_to_le16(1);
+
+	/*
+	 * Since the xe_name_offset is based on ocfs2_xattr_header,
+	 * there is a offset change corresponding to the change of
+	 * ocfs2_xattr_header's position.
+	 */
+	off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	xe = &xb_xh->xh_entries[count-1];
+	offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+	size = blocksize - offset;
+	xh->xh_name_value_len = cpu_to_le16(size);
+	xh->xh_offset = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+	mlog(0, "copy name/value from %u to %u, size = %u\n", offset,
+	     le16_to_cpu(xh->xh_offset), size);
+	/* copy all the names and values. */
+	if (data_bh)
+		target = data_bh->b_data;
+	memcpy(target + offset, src + offset, size);
+
+	/* copy all the entries. */
+	target = xh_bh->b_data;
+	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+	size = count * sizeof(struct ocfs2_xattr_entry);
+	memcpy(target + offset, (char *)xb_xh + offset, size);
+
+	/* Change the xe offset for all the xe because of the move. */
+	off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+		 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	for (i = 0; i < count; i++)
+		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+	     offset, size, off_change);
+
+	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * xs->header_bh takes an extra reference on new_bh. When the block
+ * size differs from the bucket size, xs->base becomes a kmalloc'ed copy
+ * of the whole bucket and xs->alloc_base is set; otherwise it points at
+ * new_bh's data. If the entry had been found before, xs->here is
+ * re-pointed to the entry with the same index in the new base.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from copying the bucket.
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+					   struct ocfs2_xattr_search *xs,
+					   struct buffer_head *old_bh,
+					   struct buffer_head *new_bh)
+{
+	/* Must start at 0: the single-block path never assigns ret. */
+	int ret = 0;
+	char *buf = old_bh->b_data;
+	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+	int i, blocksize = inode->i_sb->s_blocksize;
+
+	xs->header_bh = new_bh;
+	get_bh(new_bh);
+	xs->header = (struct ocfs2_xattr_header *)xs->header_bh->b_data;
+
+	if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+		xs->base = kmalloc(OCFS2_XATTR_BUCKET_SIZE,  GFP_NOFS);
+		if (!xs->base)
+			return -ENOMEM;
+		xs->alloc_base = 1;
+		ret = ocfs2_cp_xattr_bucket_to_buffer(inode,
+						      new_bh->b_blocknr,
+						      xs->base);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+	} else
+		xs->base = new_bh->b_data;
+	xs->end = xs->base + OCFS2_XATTR_BUCKET_SIZE;
+
+	if (!xs->not_found) {
+		i = xs->here - old_xh->xh_entries;
+		xs->here = &((struct ocfs2_xattr_header *)xs->base)->
+								xh_entries[i];
+	}
+
 	return ret;
 }
 
+/*
+ * Convert the xattr block in xs->xattr_bh into the root of an indexed
+ * xattr tree.
+ *
+ * One cluster is claimed for the first bucket, the entries currently held
+ * in the block are copied into it, xs is re-pointed at the new bucket and
+ * the block is then re-initialised as an ocfs2_xattr_tree_root whose single
+ * extent record covers the new cluster.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs)
+{
+	int ret, search_ret, credits = OCFS2_SUBALLOC_ALLOC;
+	u32 bit_off, len;
+	u64 blkno;
+	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	/* Start at NULL so the exit path is safe if the reservation fails. */
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+	struct buffer_head *xb_bh = xs->xattr_bh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xr;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog(0, "create xattr index block for %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr);
+
+	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * this one should be changed to the sem which is used
+	 * in xattr allocation.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * 3 more credits, one for xattr block update, one for the 1st block
+	 * of the new xattr bucket and one for the value/data.
+	 */
+	credits += 3;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_sem;
+	}
+
+	/*
+	 * The xattr block already exists and holds live data, so it is
+	 * journalled as a modification rather than as a new block.
+	 */
+	ret = ocfs2_journal_access(handle, inode, xb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * The bucket may spread in many blocks, and
+	 * we will only touch the 1st block and the last block
+	 * in the whole bucket (one for entry and one for data).
+	 */
+	blkno = ocfs2_clusters_to_blocks(sb, bit_off);
+
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
+	     (unsigned long long)blkno);
+
+	ret = ocfs2_read_block(osb, blkno, &xh_bh,
+			       OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xh_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (bpb > 1) {
+		ret = ocfs2_read_block(osb, blkno + bpb - 1, &data_bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_journal_access(handle, inode, data_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+	ocfs2_journal_dirty(handle, xh_bh);
+	if (data_bh)
+		ocfs2_journal_dirty(handle, data_bh);
+
+	/*
+	 * Remember a failure to refresh the search state, but still finish
+	 * converting the block so that it matches the bucket that has just
+	 * been written.  The error is handed back to the caller at the end.
+	 */
+	search_ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	if (search_ret)
+		mlog_errno(search_ret);
+
+	/* Re-initialize the xattr block. */
+	xr = &xb->xb_attrs.xb_root;
+	memset(xr, 0, sizeof(struct ocfs2_xattr_tree_root));
+	xr->xt_clusters = cpu_to_le32(1);
+	xr->xt_last_eb_blk = 0;
+	xr->xt_list.l_tree_depth = 0;
+	xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+	xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+	memset(xr->xt_list.l_recs, 0, sizeof(struct ocfs2_extent_rec));
+	xr->xt_list.l_recs[0].e_cpos = 0;
+	xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_journal_dirty(handle, xb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = search_ret;
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_sem:
+	up_write(&oi->ip_alloc_sem);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	brelse(xh_bh);
+	brelse(data_bh);
+
+	return ret;
+}
+
+/*
+ * sort() comparator that orders xattr entries by xe_name_offset,
+ * largest offset first.
+ */
+static int cmp_xe_offset(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *lhs = a;
+	const struct ocfs2_xattr_entry *rhs = b;
+	u32 lhs_off = le16_to_cpu(lhs->xe_name_offset);
+	u32 rhs_off = le16_to_cpu(rhs->xe_name_offset);
+
+	if (lhs_off == rhs_off)
+		return 0;
+
+	/* Descending order: the smaller offset sorts later. */
+	return lhs_off < rhs_off ? 1 : -1;
+}
+
+/*
+ * Defragment one xattr bucket.
+ *
+ * The bucket starting at header_bh is copied into a contiguous buffer, all
+ * name/value pairs are packed towards the end of the bucket, xh_offset is
+ * updated and the entries are re-sorted by name hash before the result is
+ * written back through the journal.
+ *
+ * @bucket_buf: if not NULL and the bucket was changed, receives a copy of
+ *              the defragmented bucket (OCFS2_XATTR_BUCKET_SIZE bytes).
+ * @free:       if not NULL, incremented by the number of bytes reclaimed.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     struct buffer_head *header_bh,
+				     char *bucket_buf,
+				     size_t *free)
+{
+	int ret, i;
+	size_t end, offset, len, value_len;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	u16 count = le16_to_cpu(xh->xh_count);
+	char *entries, *buf, *bucket = NULL;
+	u64 blkno = header_bh->b_blocknr;
+	u16 block_num = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	handle_t *handle;
+	struct buffer_head **bhs;
+	struct ocfs2_xattr_entry *xe;
+
+	mlog(0, "adjust xattr bucket in %llu, count = %u, "
+	     "xh_offset = %u, xh_name_value_len = %u.\n",
+	     (unsigned long long)blkno, count, le16_to_cpu(xh->xh_offset),
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	bhs = kcalloc(block_num, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(osb, blkno, block_num, bhs,
+				OCFS2_BH_CACHED, inode);
+	if (ret)
+		goto out;
+
+	/*
+	 * In order to make the operation more efficient and generic,
+	 * we copy all the blocks into a contiguous memory.
+	 */
+	bucket = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket) {
+		/* An allocation failure is not an I/O error. */
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	buf = bucket;
+	for (i = 0; i < block_num; i++, buf += blocksize)
+		memcpy(buf, bhs[i]->b_data, blocksize);
+
+	handle = ocfs2_start_trans(osb, block_num);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < block_num; i++) {
+		ret = ocfs2_journal_access(handle, inode, bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto commit;
+		}
+	}
+
+	/* From here on work on the contiguous copy only. */
+	xh = (struct ocfs2_xattr_header *)bucket;
+	entries = (char *)xh->xh_entries;
+
+	/*
+	 * sort all the entries by their offset.
+	 * the largest will be the first, so that we can
+	 * move them to the end one by one.
+	 */
+	sort(entries, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe_offset, swap_xe);
+
+	/* Move all name/values to the end of the bucket. */
+	xe = xh->xh_entries;
+	end = OCFS2_XATTR_BUCKET_SIZE;
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+		offset = le16_to_cpu(xe->xe_name_offset);
+		if (xe->xe_local)
+			value_len = OCFS2_XATTR_SIZE(
+					le64_to_cpu(xe->xe_value_size));
+		else
+			value_len = OCFS2_XATTR_ROOT_SIZE;
+		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+		/*
+		 * We must make sure that the xattr_value_root
+		 * exist in the same block. So adjust end to
+		 * the previous block end if needed.
+		 */
+		if (!xe->xe_local &&
+		    ((end - value_len) / blocksize !=
+		     (end - 1) / blocksize))
+			end = end - end % blocksize;
+
+		if (end > offset + len) {
+			memmove(bucket + end - len, bucket + offset, len);
+			xe->xe_name_offset = cpu_to_le16(end - len);
+		}
+		end -= len;
+	}
+
+	BUG_ON(le16_to_cpu(xh->xh_offset) > end);
+
+	if (free)
+		*free += end - le16_to_cpu(xh->xh_offset);
+	/* Nothing moved: leave the on-disk blocks untouched. */
+	if (le16_to_cpu(xh->xh_offset) == end)
+		goto commit;
+	xh->xh_offset = cpu_to_le16(end);
+
+	/* sort the entries by their name_hash. */
+	sort(entries, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+
+	buf = bucket;
+	for (i = 0; i < block_num; i++, buf += blocksize) {
+		memcpy(bhs[i]->b_data, buf, blocksize);
+		ocfs2_journal_dirty(handle, bhs[i]);
+	}
+
+	if (bucket_buf)
+		memcpy(bucket_buf, bucket, OCFS2_XATTR_BUCKET_SIZE);
+commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	for (i = 0; i < block_num; i++)
+		brelse(bhs[i]);
+
+	kfree(bhs);
+	kfree(bucket);
+	return ret;
+}
+
+/*
+ * Move half of the xattr buckets in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extent record.
+ *
+ * first_bh and header_bh will be updated if we move the data header_bh
+ * contains. first_hash will be set as the 1st xe's name_hash.
+ *
+ * When *header_bh is moved, the references held through *first_bh and
+ * *header_bh are dropped and replaced by references on the new buffers.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+					       handle_t *handle,
+					       struct buffer_head **first_bh,
+					       struct buffer_head **header_bh,
+					       u64 new_blkno,
+					       u64 prev_blkno,
+					       u32 num_clusters,
+					       u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int block_num = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int bucket_num = ocfs2_xattr_buckets_per_cluster(osb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+	struct ocfs2_xattr_header *new_xh;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+	BUG_ON(le16_to_cpu(xh->xh_reserved1) < bucket_num);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+	/*
+	 * *first_bh may be replaced below, so keep our own reference on the
+	 * original first block; it is dropped again at "out".
+	 */
+	prev_bh = *first_bh;
+	get_bh(prev_bh);
+	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+	/* Start of the second half of the last cluster in the extent. */
+	prev_blkno += (num_clusters - 1) * block_num + block_num / 2;
+
+	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+
+	/*
+	 * We need to update the 1st half of the cluster and
+	 * 1 more for the update of the 1st bucket of the previous
+	 * extent record.
+	 */
+	credits = block_num / 2 + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* prev_bh is an existing block: journal it as a modification. */
+	ret = ocfs2_journal_access(handle, inode, prev_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < block_num / 2; i++, prev_blkno++, new_blkno++) {
+		old_bh = new_bh = NULL;
+		new_bh = sb_getblk(inode->i_sb, new_blkno);
+		if (!new_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		ret = ocfs2_read_block(osb, prev_blkno,
+					&old_bh, OCFS2_BH_CACHED, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+		if (i == 0) {
+			/* First block of the new cluster: bucket-count head. */
+			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+			new_xh->xh_reserved1 = cpu_to_le16(bucket_num / 2);
+			if (first_hash)
+				*first_hash = le32_to_cpu(
+					new_xh->xh_entries[0].xe_name_hash);
+			new_first_bh = new_bh;
+			get_bh(new_first_bh);
+		}
+
+		ocfs2_journal_dirty(handle, new_bh);
+
+		if (*header_bh == old_bh) {
+			/* The insert bucket moved: re-point the caller. */
+			brelse(*header_bh);
+			*header_bh = new_bh;
+			get_bh(*header_bh);
+
+			brelse(*first_bh);
+			*first_bh = new_first_bh;
+			get_bh(*first_bh);
+		}
+		brelse(new_bh);
+		brelse(old_bh);
+	}
+
+	le16_add_cpu(&xh->xh_reserved1, -(bucket_num / 2));
+
+	ocfs2_journal_dirty(handle, prev_bh);
+out:
+	brelse(prev_bh);
+	brelse(new_first_bh);
+	return ret;
+}
+
+/*
+ * Split the xattr bucket at blk: the upper half of its entries (by index)
+ * is moved to the bucket at new_blk, the lower half stays where it is.
+ *
+ * @first_hash:      if not NULL, receives the name hash of the first entry
+ *                   of the new bucket.
+ * @new_bucket_head: if non-zero the new bucket's xh_reserved1 is set to 1,
+ *                   otherwise to 0.
+ *
+ * The caller must have started the transaction behind handle.
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+				   handle_t *handle,
+				   u64 blk,
+				   u64 new_blk,
+				   u32 *first_hash,
+				   int new_bucket_head)
+{
+	int ret, i;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 count, start, len, name_value_len, xe_len, name_offset;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	char *bucket = NULL, *buffer;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	     (unsigned long long)blk, (unsigned long long)new_blk);
+
+	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(osb, blk, blk_per_bucket, s_bhs,
+				OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_blocks(osb, new_blk, blk_per_bucket, t_bhs,
+				OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Only the header block of the source bucket is modified. */
+	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * In order to simplify the process, we copy the source bucket to a
+	 * buffer first, adjust it and then copy it to the dest.
+	 */
+	bucket = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	buffer = bucket;
+	for (i = 0; i < blk_per_bucket; i++, buffer += blocksize)
+		memcpy(buffer, s_bhs[i]->b_data, blocksize);
+
+	xh = (struct ocfs2_xattr_header *)bucket;
+	count = le16_to_cpu(xh->xh_count);
+	start = count / 2;
+
+	/*
+	 * Calculate the total name/value len and xh_offset for
+	 * the source bucket first.
+	 */
+	name_offset = OCFS2_XATTR_BUCKET_SIZE;
+	name_value_len = 0;
+	for (i = 0; i < start; i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (le64_to_cpu(xe->xe_value_size) > OCFS2_XATTR_INLINE_SIZE)
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		else
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		name_value_len += xe_len;
+		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+			name_offset = le16_to_cpu(xe->xe_name_offset);
+	}
+
+	/*
+	 * Now begin the modification to the dest bucket.
+	 *
+	 * In the dest bucket, We just move the xattr entry to the beginning
+	 * and don't touch the name/value. So there will be some holes in the
+	 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+	 * called.
+	 */
+	xe = &xh->xh_entries[start];
+	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+	     (int)((char *)xe - bucket),
+	     (int)((char *)xh->xh_entries - bucket));
+	memmove((char *)xh->xh_entries, (char *)xe, len);
+	xe = &xh->xh_entries[count - start];
+	len = sizeof(struct ocfs2_xattr_entry) * start;
+	memset((char *)xe, 0, len);
+
+	le16_add_cpu(&xh->xh_count, -start);
+	le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+	/* Calculate xh_offset for the new bucket. */
+	xh->xh_offset = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (le16_to_cpu(xe->xe_name_offset) <
+		    le16_to_cpu(xh->xh_offset))
+			xh->xh_offset = xe->xe_name_offset;
+	}
+
+	/* set xh->xh_reserved1 for the new xh. */
+	if (new_bucket_head)
+		xh->xh_reserved1 = cpu_to_le16(1);
+	else
+		xh->xh_reserved1 = 0;
+
+	/* Write the adjusted copy out to every block of the dest bucket. */
+	buffer = bucket;
+	for (i = 0; i < blk_per_bucket; i++, buffer += blocksize) {
+		memcpy(t_bhs[i]->b_data, buffer, blocksize);
+		ret = ocfs2_journal_dirty(handle, t_bhs[i]);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* store the first_hash of the new bucket. */
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+	/* Now update the source bucket. */
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	xh->xh_count = cpu_to_le16(start);
+	xh->xh_offset = cpu_to_le16(name_offset);
+	xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+	ret = ocfs2_journal_dirty(handle, s_bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	kfree(bucket);
+
+	return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * Every block of the bucket at s_blkno is copied over the corresponding
+ * block of the bucket at t_blkno.  t_is_new is handed through to
+ * ocfs2_read_xattr_bucket() when the target buffers are obtained.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new)
+{
+	int ret, i;
+	int block_num = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+
+	BUG_ON(s_blkno == t_blkno);
+
+	mlog(0, "cp bucket %llu to %llu, target is %d\n",
+	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
+	     t_is_new);
+
+	s_bhs = kcalloc(block_num, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	if (ret)
+		goto out;
+
+	t_bhs = kcalloc(block_num, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < block_num; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret)
+			goto out;
+	}
+
+	for (i = 0; i < block_num; i++) {
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+	}
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < block_num; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < block_num; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_reserved1 will be initialized as the bucket num in the cluster.
+ *
+ * first_bh is the first block of the previous extent record; its
+ * xh_reserved1 is reduced by the number of buckets copied away.
+ * If first_hash is not NULL it receives the name hash of the first entry
+ * in the new cluster.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+				  handle_t *handle,
+				  struct buffer_head *first_bh,
+				  u64 src_blk,
+				  u64 to_blk,
+				  u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int block_num = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int bucket_num = ocfs2_xattr_buckets_per_cluster(osb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_header *xh;
+
+	mlog(0, "cp xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)src_blk, (unsigned long long)to_blk);
+
+	/*
+	 * We need to update the new cluster and 1 more for the update of
+	 * the 1st bucket of the previous extent rec.
+	 */
+	credits = block_num + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* first_bh is an existing block: journal it as a modification. */
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < bucket_num; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    src_blk, to_blk, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	}
+
+	/* update the old bucket header. */
+	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	le16_add_cpu(&xh->xh_reserved1, -bucket_num);
+
+	ocfs2_journal_dirty(handle, first_bh);
+
+	/* update the new bucket header. */
+	to_blk -= block_num;	/* rewind to the first block just copied */
+	ret = ocfs2_read_block(osb, to_blk, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh->xh_reserved1 = cpu_to_le16(bucket_num);
+
+	ocfs2_journal_dirty(handle, bh);
+
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Move half of the xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ *
+ * Returns whatever ocfs2_half_xattr_bucket() returns.
+ */
+static inline int ocfs2_half_xattr_cluster(struct inode *inode,
+					   handle_t *handle,
+					   u64 prev_blk,
+					   u64 new_blk,
+					   u32 *first_hash)
+{
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE <
+	       OCFS2_SB(inode->i_sb)->s_clustersize);
+
+	/*
+	 * Move half of the xattrs in prev_blk to new_blk.  The new bucket
+	 * is marked as a bucket head (last argument).
+	 */
+	return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+				       new_blk, first_hash, 1);
+}
+
+/*
+ * This is a new separate cluster, we will move some xattrs in the previous
+ * cluster to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and
+ * header_bh will also be updated since they will be used in
+ * ocfs2_extend_xattr_bucket to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when
+ * should we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has
+ *    more than 1 bucket, so just move half nums of bucket into the new
+ *    cluster and update the first_bh and header_bh if the insert bucket
+ *    has been moved to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ *    a) If the previous extent rec has more than one cluster and the insert
+ *       place isn't in the last cluster, copy the entire last cluster to
+ *       the new one. This time, we don't need to update the first_bh and
+ *       header_bh since they will not be moved into the new cluster.
+ *    b) Otherwise, move the bottom half of the xattrs in the last cluster
+ *       into the new one. And we set the extend flag to zero if the insert
+ *       place is moved into the new allocated cluster since no extend is
+ *       needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+					    handle_t *handle,
+					    struct buffer_head **first_bh,
+					    struct buffer_head **header_bh,
+					    u64 new_blk,
+					    u64 prev_blk,
+					    u32 prev_clusters,
+					    u32 *v_start,
+					    int *extend)
+{
+	int status;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u64 last_blk;
+
+	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+	     prev_blk, prev_clusters, new_blk);
+
+	/* Case 1: several buckets per cluster. */
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+		return ocfs2_mv_xattr_bucket_cross_cluster(inode, handle,
+							   first_bh, header_bh,
+							   new_blk, prev_blk,
+							   prev_clusters,
+							   v_start);
+
+	/* Case 2: one bucket per cluster; work on the last cluster. */
+	last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+	/* Case 2a: the insert bucket is not the last one, copy it whole. */
+	if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+		return ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+					      last_blk, new_blk, v_start);
+
+	/* Case 2b: split the last cluster between old and new. */
+	status = ocfs2_half_xattr_cluster(inode, handle, last_blk,
+					  new_blk, v_start);
+
+	if ((*header_bh)->b_blocknr == last_blk && extend)
+		*extend = 0;
+
+	return status;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ *
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+				       struct buffer_head *root_bh,
+				       struct buffer_head **first_bh,
+				       struct buffer_head **header_bh,
+				       u32 *num_clusters,
+				       u32 prev_cpos,
+				       u64 prev_blkno,
+				       int *extend)
+{
+	int ret, credits;
+	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 prev_clusters = *num_clusters;
+	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+	u32 new_buckets;
+	u64 block;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_header *first_xh =
+			(struct ocfs2_xattr_header *)(*first_bh)->b_data;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *root_el = &xb_root->xt_list;
+	enum ocfs2_extent_tree_type type = OCFS2_XATTR_TREE_EXTENT;
+
+	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+	     "previous xattr blkno = %llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     prev_cpos, (unsigned long long)prev_blkno);
+
+	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
+				    clusters_to_add, 0, &data_ac,
+				    &meta_ac, type, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, root_el, clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				     clusters_to_add, &bit_off, &num_bits);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for xattr in inode "
+	     "%llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	/*
+	 * Number of buckets the previous extent record would hold if the
+	 * new cluster were appended to it.  It is kept in 32 bits so that
+	 * the range check below is not defeated by integer promotion.
+	 */
+	new_buckets = le16_to_cpu(first_xh->xh_reserved1) +
+		      num_bits * ocfs2_xattr_buckets_per_cluster(osb);
+
+	if (prev_blkno + prev_clusters * bpc == block &&
+	    new_buckets == (u16)new_buckets) {
+		/*
+		 * If this cluster is contiguous with the old one and
+		 * adding this new cluster, we don't surpass the limit of
+		 * xh_reserved1, cool. We will let it be initialized
+		 * and used like other buckets in the previous cluster.
+		 * So add it as a contiguous one. The caller will handle
+		 * its init process.
+		 */
+		v_start = prev_cpos + prev_clusters;
+		*num_clusters = prev_clusters + clusters_to_add;
+		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+		     clusters_to_add);
+	} else {
+		ret = ocfs2_adjust_xattr_cross_cluster(inode,
+						       handle,
+						       first_bh,
+						       header_bh,
+						       block,
+						       prev_blkno,
+						       prev_clusters,
+						       &v_start,
+						       extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+
+	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+	     num_bits, (unsigned long long)block, v_start);
+	ret = ocfs2_insert_extent(osb, handle, inode, root_bh,
+				  v_start, block, num_bits,
+				  0, meta_ac, type, NULL);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+leave:
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Extend a new xattr bucket and mv the data to the end one by one.
+ * for the start_bh, move half of the xattr to the bucket after it.
+ *
+ * first_bh is the first block of the extent record; its xh_reserved1
+ * holds the number of buckets in use and is incremented here.
+ * num_clusters is the length of that extent record in clusters.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     struct buffer_head *first_bh,
+				     struct buffer_head *start_bh,
+				     u32 num_clusters)
+{
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 start_blk = start_bh->b_blocknr, end_blk;
+	u32 bucket_num = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+	handle_t *handle;
+	struct ocfs2_xattr_header *first_xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket = le16_to_cpu(first_xh->xh_reserved1);
+
+	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+	     "from %llu, len = %u\n", (unsigned long long)start_blk,
+	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+	/* There must be room for one more bucket in the extent. */
+	BUG_ON(bucket >= bucket_num);
+
+	/* First block of the last bucket currently in use. */
+	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+	/*
+	 * We will touch all the buckets after the start_bh(include it).
+	 * Add one more bucket and modify the first_bh.
+	 */
+	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	/* Shift every bucket after start_blk up by one, last one first. */
+	while (end_blk != start_blk) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+					    end_blk + blk_per_bucket, 0);
+		if (ret)
+			goto commit;
+		end_blk -= blk_per_bucket;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+				      start_blk + blk_per_bucket, NULL, 0);
+
+	le16_add_cpu(&first_xh->xh_reserved1, 1);
+	ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Add new xattr bucket in a extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block and header_bh is one header of a bucket.
+ * We will move all the buckets starting from it to the next place. As for
+ * this one, half of its xattr will be moved to the next one.
+ *
+ * If the extent record has no free bucket left, a new cluster is added
+ * first via ocfs2_add_new_xattr_cluster().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+				      struct buffer_head *xb_bh,
+				      struct buffer_head *header_bh)
+{
+	struct ocfs2_xattr_header *first_xh = NULL;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int ret, bucket_num, extend = 1;
+	u64 p_blkno;
+	u32 e_cpos, num_clusters;
+
+	/*
+	 * Adding a cluster may swap our local header_bh for a buffer in the
+	 * new cluster, dropping one reference on the old buffer and taking
+	 * one on the new.  Hold a reference of our own so that the caller's
+	 * reference stays intact and whichever buffer we end up with is
+	 * released exactly once at "out".
+	 */
+	get_bh(header_bh);
+
+	mlog(0, "Add new xattr bucket starting form %llu\n",
+	     (unsigned long long)header_bh->b_blocknr);
+	ret = ocfs2_xattr_get_bucket(inode, name_hash, &p_blkno, &e_cpos,
+				     &num_clusters, el);
+	if (ret)
+		goto out;
+
+	ret = ocfs2_read_block(osb, p_blkno,
+				&first_bh, OCFS2_BH_CACHED, inode);
+	if (ret)
+		goto out;
+
+	bucket_num = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+	/* Every bucket of the extent is in use: grow by one cluster. */
+	if (bucket_num == le16_to_cpu(first_xh->xh_reserved1)) {
+		ret = ocfs2_add_new_xattr_cluster(inode,
+						  xb_bh,
+						  &first_bh,
+						  &header_bh,
+						  &num_clusters,
+						  e_cpos,
+						  p_blkno,
+						  &extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (extend)
+		ret = ocfs2_extend_xattr_bucket(inode,
+						first_bh,
+						header_bh,
+						num_clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(header_bh);
+	brelse(first_bh);
+	return ret;
+}
+
+/*
+ * Apply one set/replace/remove request to the bucket whose header is at
+ * xs->base.
+ *
+ * inode:     only its superblock block size is used here.
+ * bucket:    unused in this function.
+ * xi:        name, name_index, value and value_len of the request.  A NULL
+ *            xi->value on an existing entry means "remove the entry".
+ * xs:        search result; xs->not_found selects insert vs. update and
+ *            xs->here is the existing entry.  xs->here is pointed at the
+ *            entry written whenever new name/value space is allocated.
+ * name_hash: hash of the name, used to find the insert position.
+ * local:     stored into xe_local; when zero, the value area holds an
+ *            ocfs2_xattr_value_root that must not cross a block boundary.
+ * is_empty:  optional; set to 1 when a removal leaves xh_count at zero.
+ *
+ * The function performs no free-space check of its own.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+					 char *bucket,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 u32 name_hash,
+					 int local,
+					 int *is_empty)
+{
+	struct ocfs2_xattr_entry *last, *xe;
+	int name_len = strlen(xi->name);
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+	u16 count = le16_to_cpu(xh->xh_count);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	void *val;
+	size_t offs, size, new_size;
+
+	last = &xh->xh_entries[count];
+	if (!xs->not_found) {
+		xe = xs->here;
+		offs = le16_to_cpu(xe->xe_name_offset);
+		val = xs->base + offs;
+		if (xe->xe_local)
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		/*
+		 * If the new value will be stored outside, xi->value has been
+		 * initialized as an empty ocfs2_xattr_value_root, and the same
+		 * goes for xi->value_len, so we can set new_size safely here.
+		 * See ocfs2_xattr_set_in_bucket.
+		 */
+		new_size = OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len);
+
+		/* The old name+value no longer counts as used space. */
+		le16_add_cpu(&xh->xh_name_value_len, -size);
+		if (xi->value) {
+			if (new_size > size)
+				goto set_new_name_value;
+
+			/*
+			 * We must make sure that the xattr_value_root exists in
+			 * a single block; if the old place doesn't meet that
+			 * need, we have to allocate new space in the bucket.
+			 */
+			if (!local && offs / blocksize !=
+				      (offs + new_size - 1) / blocksize)
+				goto set_new_name_value;
+
+			/* Now replace the old value with new one. */
+			if (local)
+				xe->xe_value_size = cpu_to_le64(xi->value_len);
+			else
+				xe->xe_value_size = 0;
+
+			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+			       size - OCFS2_XATTR_SIZE(name_len));
+			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+				memcpy(val + OCFS2_XATTR_SIZE(name_len),
+				       xi->value, xi->value_len);
+
+			le16_add_cpu(&xh->xh_name_value_len, new_size);
+			xe->xe_local = local;
+			return;
+		} else {
+			/* Remove the old entry. */
+			last -= 1;
+			memmove(xe, xe + 1,
+				(void *)last - (void *)xe);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xh->xh_count, -1);
+			if (xh->xh_count == 0 && is_empty)
+				*is_empty = 1;
+			return;
+		}
+	} else {
+		/* Find the slot for the new entry by binary search on hash. */
+		int low = 0, high = count - 1, tmp;
+		struct ocfs2_xattr_entry *tmp_xe;
+
+		while (low <= high) {
+			tmp = (low + high) / 2;
+			tmp_xe = &xh->xh_entries[tmp];
+
+			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+				low = tmp + 1;
+			else if (name_hash <
+				 le32_to_cpu(tmp_xe->xe_name_hash))
+				high = tmp - 1;
+			else {
+				/*
+				 * Insert at the matching slot; leaving "low"
+				 * untouched could place the entry in front of
+				 * smaller hashes and break the ordering the
+				 * search relies on.
+				 */
+				low = tmp;
+				break;
+			}
+		}
+
+		xe = &xh->xh_entries[low];
+		if (low != count)
+			memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+		le16_add_cpu(&xh->xh_count, 1);
+		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+		xe->xe_name_hash = cpu_to_le32(name_hash);
+		xe->xe_name_len = name_len;
+		xe->xe_type = xi->name_index;
+	}
+
+set_new_name_value:
+	/* Insert the new name+value. */
+	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+	/*
+	 * We must make sure that the xattr_value_root
+	 * exists in a single block.
+	 */
+	offs = le16_to_cpu(xh->xh_offset);
+	if (!local) {
+		u16 val_start = offs - OCFS2_XATTR_ROOT_SIZE;
+
+		if (val_start >> inode->i_sb->s_blocksize_bits !=
+		    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+			/* Round down to the block start so the root fits. */
+			offs = offs - offs % blocksize;
+			xh->xh_offset = cpu_to_le16(offs);
+		}
+	}
+	val = xs->base + offs - size;
+	xe->xe_name_offset = cpu_to_le16(offs - size);
+
+	/* Zero the padding behind the name, then copy the name itself. */
+	memset(val + OCFS2_XATTR_SIZE(name_len) - OCFS2_XATTR_PAD,
+	       0, OCFS2_XATTR_PAD);
+	memcpy(val, xi->name, name_len);
+
+	/* Same for the value. */
+	memset(val + size - OCFS2_XATTR_PAD, 0, OCFS2_XATTR_PAD);
+	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+	xe->xe_value_size = cpu_to_le64(xi->value_len);
+	xe->xe_local = local;
+	xs->here = xe;
+	le16_add_cpu(&xh->xh_offset, -size);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+	return;
+}
+
+/*
+ * Journal and write back the blocks of a bucket that are covered by the
+ * entry xs->here.
+ *
+ * The blocks marked as touched are: block 0, the blocks spanned by the
+ * name/value area of xs->here, and the blocks spanned by the entry array
+ * from xs->here up to the last entry.  Each touched block gets
+ * ocfs2_journal_access(), then the matching slice of xs->base is copied
+ * into bhs[i]->b_data and the block is dirtied.
+ *
+ * bhs/bh_num: the buffer heads of the bucket and how many there are.
+ *
+ * Returns 0 or the error from the journal calls.  A failing
+ * ocfs2_journal_dirty() is logged but does not stop the loop, so the
+ * value returned is the result of the last block dirtied.
+ */
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+					     handle_t *handle,
+					     struct ocfs2_xattr_search *xs,
+					     struct buffer_head **bhs,
+					     u16 bh_num)
+{
+	int ret = 0, i, len, off, block_off, block_end;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)xs->base;
+	u16 xh_count = le16_to_cpu(xh->xh_count);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	char touched[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+	memset(touched, 0, sizeof(touched));
+
+	/*
+	 * First calculate all the blocks we should journal_access
+	 * and journal_dirty. The first block should always be touched.
+	 */
+	touched[0] = 1;
+
+	/* calc the data first. */
+	off = le16_to_cpu(xe->xe_name_offset);
+	block_off = off >> inode->i_sb->s_blocksize_bits;
+	len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+	if (xe->xe_local)
+		len += OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+	else
+		len += OCFS2_XATTR_ROOT_SIZE;
+	off += len - 1;
+	block_end = off >> inode->i_sb->s_blocksize_bits;
+	for (i = block_off; i <= block_end; i++)
+		touched[i] = 1;
+
+	/* Now the xe_entry: everything from xe to the end of the array. */
+	off = (char *)xe - (char *)xh;
+	block_off = off >> inode->i_sb->s_blocksize_bits;
+	len = ((char *)&xh->xh_entries[xh_count]) - (char *)xe;
+	block_end = (off + len - 1) >> inode->i_sb->s_blocksize_bits;
+	for (i = block_off; i <= block_end; i++)
+		touched[i] = 1;
+
+	/* Get journal access to every touched block before writing any. */
+	for (i = 0; i < bh_num; i++) {
+		if (!touched[i])
+			continue;
+
+		ret = ocfs2_journal_access(handle, inode, bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	for (i = 0; i < bh_num; i++) {
+		if (!touched[i])
+			continue;
+
+		memcpy(bhs[i]->b_data, xs->base + i * blocksize, blocksize);
+		ret = ocfs2_journal_dirty(handle, bhs[i]);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	return ret;
+}
+
+/*
+ * Update one entry in the bucket that starts at xs->header_bh inside a
+ * single transaction.
+ *
+ * All blocks of the bucket are read.  If xs->base is not set yet (only
+ * legal when the xattr was not found), it is pointed at the first block's
+ * data, or, when a bucket spans several blocks, at a freshly allocated
+ * OCFS2_XATTR_BUCKET_SIZE copy of the bucket (xs->alloc_base is then set
+ * and the buffer is not freed here).  The entry is changed by
+ * ocfs2_xattr_set_entry_normal() and the touched blocks are journaled by
+ * ocfs2_xattr_bucket_handle_journal().
+ *
+ * bucket_empty is passed through to ocfs2_xattr_set_entry_normal().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   struct ocfs2_xattr_info *xi,
+					   struct ocfs2_xattr_search *xs,
+					   u32 name_hash,
+					   int local,
+					   int *bucket_empty)
+{
+	int i, ret;
+	handle_t *handle = NULL;
+	struct buffer_head **bhs = NULL;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 blk = xs->header_bh->b_blocknr;
+	char *buf;
+
+	mlog(0, "Set xattr entry len = %d index = %d in bucket %llu\n",
+	     xi->value_len, xi->name_index, (unsigned long long)blk);
+
+	bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(osb, blk, blk_per_bucket,
+				bhs, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!xs->base) {
+		/* we should already set xs->base if we have found the xattr. */
+		BUG_ON(!xs->not_found);
+
+		if (blocksize < OCFS2_XATTR_BUCKET_SIZE) {
+			/*
+			 * This is a new entry and we haven't found it before,
+			 * so base isn't set in entry_find.
+			 */
+			xs->base = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+			if (!xs->base) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			xs->alloc_base = 1;
+
+			buf = xs->base;
+			for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+				memcpy(buf, bhs[i]->b_data, blocksize);
+		} else
+			xs->base = bhs[0]->b_data;
+	}
+
+	handle = ocfs2_start_trans(osb, blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_xattr_set_entry_normal(inode, xs->base, xi, xs,
+				     name_hash, local, bucket_empty);
+
+	/* Only access and dirty the blocks we have touched in set xattr. */
+	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+						bhs, blk_per_bucket);
+	if (ret)
+		mlog_errno(ret);
+out:
+	/* Early failures reach here before any transaction was started. */
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+
+	if (bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(bhs[i]);
+		kfree(bhs);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Store new_size into xe->xe_value_size under its own one-credit
+ * transaction, journaling xe_bh (the buffer the caller says holds xe).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+				    struct buffer_head *xe_bh,
+				    struct ocfs2_xattr_entry *xe,
+				    u64 new_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, 1);
+	/*
+	 * Check with IS_ERR like the other ocfs2_start_trans() callers in
+	 * this file; a NULL test would let an error pointer through.
+	 */
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xe->xe_value_size = cpu_to_le64(new_size);
+
+	ret = ocfs2_journal_dirty(handle, xe_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Truncate the outside-stored value of entry number xe_off in the bucket
+ * starting at header_bh to len bytes, then record len as the entry's
+ * value size.
+ *
+ * The entry must not be local (BUG otherwise) and its value root must sit
+ * inside one block (BUG otherwise).  That block is read, the value is
+ * truncated with ocfs2_xattr_value_truncate() and xe_value_size is
+ * rewritten via ocfs2_xattr_value_update_size().
+ *
+ * new_xe / new_xv: optional destinations that receive a copy of the
+ * updated entry and value root when they do not already point at them.
+ *
+ * Returns 0 on success or a negative errno (-EIO when the entry cannot
+ * be located).
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+					     struct buffer_head *header_bh,
+					     int xe_off,
+					     int len,
+					     char *new_xe,
+					     char *new_xv)
+{
+	int ret, offset;
+	u64 value_blk;
+	struct buffer_head *value_bh = NULL, *xe_bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	if (blocksize == OCFS2_XATTR_BUCKET_SIZE) {
+		/* Single-block bucket: the entry is in the header block. */
+		xe_bh = header_bh;
+		get_bh(xe_bh);
+		xe = &xh->xh_entries[xe_off];
+	} else {
+		xe = ocfs2_get_xe_in_bucket(inode, header_bh,
+					    &xe_bh, xe_off);
+		if (!xe) {
+			ret = -EIO;
+			goto out;
+		}
+		if (!xe_bh) {
+			/* Take our own reference so "out" can brelse it. */
+			xe_bh = header_bh;
+			get_bh(xe_bh);
+		}
+	}
+
+	BUG_ON(!xe || xe->xe_local);
+
+	/* The value root follows the padded name. */
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	value_blk = offset / blocksize;
+
+	/* We don't allow ocfs2_xattr_value to be stored in different block. */
+	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+	value_blk += header_bh->b_blocknr;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), value_blk,
+			       &value_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(value_bh->b_data + offset % blocksize);
+
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_value_update_size(inode, xe_bh, xe, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (new_xe && new_xe != (char *)xe)
+		memcpy(new_xe, xe, sizeof(struct ocfs2_xattr_entry));
+	if (new_xv && new_xv != (char *)xv)
+		memcpy(new_xv, xv, OCFS2_XATTR_ROOT_SIZE);
+out:
+	brelse(xe_bh);
+	brelse(value_bh);
+	return ret;
+}
+
+/*
+ * Truncate the outside value of the entry xs->here to len bytes.
+ *
+ * Wrapper around ocfs2_xattr_bucket_value_truncate() that derives the
+ * entry index from xs and asks for the updated entry and value root to be
+ * copied back into xs->base.  xs->here must be a non-local entry.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						int len)
+{
+	int ret, offset;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+	u16 val_offset;
+
+	/* Validate before touching xe; it is dereferenced right below. */
+	BUG_ON(!xs->base || !xe || xe->xe_local);
+
+	val_offset = le16_to_cpu(xe->xe_name_offset) +
+		     OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	/* Index of xe inside the header's entry array. */
+	offset = xe - xh->xh_entries;
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->header_bh,
+						offset, len, (char *)xe,
+						xs->base + val_offset);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * Write value_len bytes from val into the outside storage described by
+ * the value root of xs->here.  The entry must exist and be non-local.
+ * Returns whatever __ocfs2_xattr_set_value_outside() returns.
+ */
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						char *val,
+						int value_len)
+{
+	struct ocfs2_xattr_entry *entry = xs->here;
+	struct ocfs2_xattr_value_root *root;
+	int root_off;
+
+	BUG_ON(!xs->base || !entry || entry->xe_local);
+
+	/* The value root is placed directly behind the padded name. */
+	root_off = le16_to_cpu(entry->xe_name_offset) +
+		   OCFS2_XATTR_SIZE(entry->xe_name_len);
+	root = (struct ocfs2_xattr_value_root *)(xs->base + root_off);
+
+	return __ocfs2_xattr_set_value_outside(inode, root, val, value_len);
+}
+
+/*
+ * Remove the xattr bucket pointed to by bucket_bh.
+ * All the buckets after it in the same xattr extent rec are moved
+ * forward by one bucket, and the bucket count kept in the first bucket's
+ * header (xh_reserved1) is decremented.
+ *
+ * first_bh:  first bucket of the extent rec.
+ * bucket_bh: header block of the bucket to drop.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_rm_xattr_bucket(struct inode *inode,
+				 struct buffer_head *first_bh,
+				 struct buffer_head *bucket_bh)
+{
+	int ret = 0, credits;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket_num = le16_to_cpu(xh->xh_reserved1);
+	u64 end, start = bucket_bh->b_blocknr;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	/* Block number of the last bucket in this extent rec. */
+	end = first_bh->b_blocknr + (bucket_num - 1) * blk_per_bucket;
+
+	mlog(0, "rm xattr bucket %llu\n",
+	     (unsigned long long)bucket_bh->b_blocknr);
+	/*
+	 * We need to update the first xattr_header and all the buckets starting
+	 * from start in this xattr rec.
+	 *
+	 * XXX: Should we empty the old last bucket here?
+	 */
+	credits = 1 + end - start;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * NOTE(review): first_bh is an existing block that is only modified
+	 * here, yet it is accessed with OCFS2_JOURNAL_ACCESS_CREATE while
+	 * the sibling functions use OCFS2_JOURNAL_ACCESS_WRITE -- confirm.
+	 */
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Copy every following bucket one slot towards the front. */
+	while (start < end) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    start + blk_per_bucket,
+					    start, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		start += blk_per_bucket;
+	}
+
+	/* update the first_bh. */
+	xh->xh_reserved1 = cpu_to_le16(bucket_num - 1);
+	ret = ocfs2_journal_dirty(handle, first_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+	return ret;
+}
+
+/*
+ * Remove one extent rec from the xattr tree rooted in root_bh and hand
+ * its clusters to the truncate log.
+ *
+ * root_bh: xattr block holding the tree root.
+ * blkno:   first physical block of the extent, appended to the truncate
+ *          log together with len.
+ * cpos:    logical start of the extent rec to remove.
+ * len:     number of clusters to remove.
+ *
+ * Runs under the truncate-log inode's i_mutex, flushing the truncate log
+ * first when ocfs2_truncate_log_needs_flush() says so.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_extent_list *root_el = &xb->xb_attrs.xb_root.xt_list;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+	     cpos, len, (unsigned long long)blkno);
+
+	ret = ocfs2_lock_allocators(inode, root_bh, root_el,
+				    0, 1, NULL, &meta_ac,
+				    OCFS2_XATTR_TREE_EXTENT, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	/*
+	 * Check with IS_ERR like the other ocfs2_start_trans() callers in
+	 * this file; a NULL test would let an error pointer through.
+	 */
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, root_bh, cpos, len, handle, meta_ac,
+				  &dealloc, OCFS2_XATTR_TREE_EXTENT, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Keep the root's cluster count in step with the tree. */
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+/*
+ * Drop the now-empty bucket xs->header_bh from its extent rec, and drop
+ * the whole extent rec once its bucket count (xh_reserved1 of the first
+ * bucket) has reached zero.
+ *
+ * xi is unused here; name_hash locates the extent rec that holds the
+ * bucket.  The bucket must already have xh_count == 0 (BUG otherwise).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_bucket_shrink(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs,
+				     u32 name_hash)
+{
+	int ret;
+	u32 e_cpos, num_clusters;
+	u64 p_blkno;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_header *first_xh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	BUG_ON(xs->header->xh_count != 0);
+
+	ret = ocfs2_xattr_get_bucket(inode, name_hash, &p_blkno,
+				     &e_cpos, &num_clusters,
+				     &xb->xb_attrs.xb_root.xt_list);
+	if (ret) {
+		/* p_blkno and friends are not valid; don't go on. */
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno,
+			       &first_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_rm_xattr_bucket(inode, first_bh, xs->header_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	if (first_xh->xh_reserved1 == 0) {
+		ret = ocfs2_rm_xattr_cluster(inode, xs->xattr_bh,
+					     p_blkno, e_cpos,
+					     num_clusters);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out:
+	brelse(first_bh);
+	return ret;
+}
+
+/*
+ * Set, replace or remove the xattr described by xi in the bucket that xs
+ * points at.
+ *
+ * Values longer than OCFS2_XATTR_INLINE_SIZE are stored outside the
+ * bucket: the bucket only keeps a value root (xi->value / xi->value_len
+ * are redirected to def_xv / OCFS2_XATTR_ROOT_SIZE for the in-bucket
+ * update) and the real bytes are written afterwards.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret, local = 1, bucket_empty = 0;
+	int store_outside;
+	size_t value_len;
+	char *val = (char *)xi->value;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	u32 name_hash = ocfs2_xattr_hash_by_name(xi->name_index, xi->name);
+
+	if (!xs->not_found && !xe->xe_local) {
+		/*
+		 * The old value lives outside the bucket, so its storage
+		 * has to be resized first: to the new length when the new
+		 * value stays outside, or to zero when the new value moves
+		 * into the bucket (or the xattr goes away).
+		 */
+		value_len = (xi->value_len > OCFS2_XATTR_INLINE_SIZE) ?
+			    xi->value_len : 0;
+
+		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+							   value_len);
+		if (ret)
+			return ret;
+
+		/* Outside -> outside: nothing in the bucket changes. */
+		if (value_len)
+			return ocfs2_xattr_bucket_set_value_outside(inode, xs,
+								    val,
+								    value_len);
+	}
+
+	/* From here on the bucket itself has to be modified. */
+	value_len = xi->value_len;
+	store_outside = value_len > OCFS2_XATTR_INLINE_SIZE;
+	if (store_outside) {
+		/* Insert an empty value root first; data follows later. */
+		local = 0;
+		xi->value = &def_xv;
+		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+	}
+
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash,
+					      local, &bucket_empty);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (!store_outside) {
+		/* An in-bucket update may have emptied the bucket. */
+		if (bucket_empty)
+			ret = ocfs2_xattr_bucket_shrink(inode, xi,
+							xs, name_hash);
+		return ret;
+	}
+
+	/* Allocate the outside storage, then fill it. */
+	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, value_len);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+}
+
+/*
+ * Check whether the xattr bucket is filled up with the same hash value.
+ *
+ * Compares the hash of the last entry (index xh_count - 1) with the hash
+ * of the first one.  Returns -ENOSPC when they are equal, -EIO when the
+ * last entry cannot be located, and 0 otherwise.
+ */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+					      struct buffer_head *header_bh)
+{
+	int ret = 0;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)header_bh->b_data;
+	u16 count = le16_to_cpu(xh->xh_count);
+	struct buffer_head *xe_bh = NULL;
+	struct ocfs2_xattr_entry *xe;
+
+	xe = ocfs2_get_xe_in_bucket(inode, header_bh, &xe_bh, count - 1);
+	if (!xe)
+		return -EIO;
+
+	/* Both sides are little-endian, so a raw compare is enough. */
+	if (xe->xe_name_hash == xh->xh_entries[0].xe_name_hash) {
+		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+		     "hash = %u\n", (unsigned long long)header_bh->b_blocknr,
+		      le32_to_cpu(xe->xe_name_hash));
+		ret = -ENOSPC;
+	}
+
+	brelse(xe_bh);
+	return ret;
+}
+
+/*
+ * Set the xattr described by xi in an indexed xattr block, making room in
+ * the target bucket first when necessary.
+ *
+ * The space needed is compared with the free gap between the entry array
+ * and xh_offset.  If it does not fit, the bucket is defragmented when the
+ * total unused space would suffice; failing that, one new bucket is added
+ * (after checking that the bucket is not full of a single hash value),
+ * the search in xs is redone and the space check is repeated.  A second
+ * allocation attempt is a BUG.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	u16 count, header_size;
+	size_t free, max_free, need, old;
+	size_t value_size = 0, name_len = strlen(xi->name);
+	int ret, allocation = 0, new_outside = 0;
+
+	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+	xh = xs->header;
+	count = le16_to_cpu(xh->xh_count);
+	header_size = sizeof(struct ocfs2_xattr_header) +
+			count * sizeof(struct ocfs2_xattr_entry);
+	/* free: contiguous gap; max_free: what packing could give us. */
+	free = le16_to_cpu(xh->xh_offset) - header_size;
+	max_free = OCFS2_XATTR_BUCKET_SIZE -
+		le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		new_outside = 1;
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->value)
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (xs->not_found)
+		need = sizeof(struct ocfs2_xattr_entry) +
+			OCFS2_XATTR_SIZE(name_len) + value_size;
+	else {
+		need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+		/*
+		 * We only replace the old value if the new length is smaller
+		 * than the old one. Otherwise we will allocate new space in the
+		 * bucket to store it.
+		 *
+		 * If the new value will be stored outside and the old value
+		 * is an in-bucket xattr, there are some cases that old space
+		 * isn't suitable(e.g, the space is cross-block and the new
+		 * xattr value root can't be stored in the same block),
+		 * so calculate "need" in this case.
+		 */
+		xe = xs->here;
+		if (xe->xe_local)
+			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		if (old >= value_size && (!new_outside || !xe->xe_local))
+			need = 0;
+	}
+
+	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %zu, "
+	     "need = %zu, max_free = %zu\n", xs->not_found,
+	     (unsigned long long)xs->header_bh->b_blocknr,
+	     free, need, max_free);
+
+	if (free < need) {
+		if (need <= max_free) {
+			/*
+			 * We can create the space by packing. Since only the
+			 * name/value will be moved, the xe shouldn't be changed
+			 * in xs.
+			 */
+			ret = ocfs2_defrag_xattr_bucket(inode, xs->header_bh,
+							xs->base, &free);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			if (free >= need)
+				goto xattr_set;
+
+			mlog(0, "Can't get enough space for xattr insert by "
+			     "defragment. Need %zu bytes, but we have %zu, so "
+			     "allocate new clusters for it.\n", need, free);
+		}
+
+		/*
+		 * We have to add new buckets or clusters and one
+		 * allocation should leave us enough space for insert.
+		 */
+		BUG_ON(allocation);
+
+		/*
+		 * We do not allow for overlapping ranges between buckets. And
+		 * the maximum number of collisions we will allow for then is
+		 * one bucket's worth, so check it here whether we need to
+		 * add a new bucket for the insert.
+		 */
+		ret = ocfs2_check_xattr_bucket_collision(inode, xs->header_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_add_new_xattr_bucket(inode,
+						 xs->xattr_bh,
+						 xs->header_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Drop the stale search state and look the name up again. */
+		brelse(xs->header_bh);
+		xs->header_bh = NULL;
+		if (xs->alloc_base) {
+			kfree(xs->base);
+			xs->base = NULL;
+			xs->alloc_base = 0;
+		}
+		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+						   xi->name_index,
+						   xi->name, xs);
+		if (ret && ret != -ENODATA)
+			goto out;
+		xs->not_found = ret;
+		allocation = 1;
+		goto try_again;
+	}
+
+xattr_set:
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Truncate the outside storage of every non-local entry in one bucket to
+ * zero bytes.  Local entries are skipped.  para is unused.
+ *
+ * Stops at the first failure and returns its error; returns 0 otherwise.
+ */
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct buffer_head *header_bh,
+					struct ocfs2_xattr_header *xh,
+					void *para)
+{
+	u16 idx = 0;
+	u16 total = le16_to_cpu(xh->xh_count);
+	int status = 0;
+
+	while (idx < total) {
+		if (!xh->xh_entries[idx].xe_local) {
+			status = ocfs2_xattr_bucket_value_truncate(inode,
+								   header_bh,
+								   idx, 0,
+								   NULL,
+								   NULL);
+			if (status) {
+				mlog_errno(status);
+				break;
+			}
+		}
+		idx++;
+	}
+
+	return status;
+}
+
+/*
+ * Free everything hanging off an indexed xattr block.
+ *
+ * Walks the extent recs from the highest hash (UINT_MAX) downwards: for
+ * each rec, the outside values of all its buckets are truncated via
+ * ocfs2_delete_xattr_in_bucket() and the rec's clusters are removed with
+ * ocfs2_rm_xattr_cluster().  The walk ends after the rec that starts at
+ * e_cpos 0.
+ *
+ * Returns 0 on success (including an empty tree) or a negative errno.
+ */
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+	u64 p_blkno;
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_bucket(inode, name_hash, &p_blkno,
+					     &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_delete_xattr_in_bucket,
+						  NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+					     p_blkno, e_cpos, num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		/* Continue with the rec that ends just below this one. */
+		name_hash = e_cpos - 1;
+	}
+
+out:
+	return ret;
+}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 6e17242..84484ca 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -56,7 +56,9 @@ struct ocfs2_xattr_header {
 	__le16	xh_count;
 	__le16	xh_reserved1;
 	__le32	xh_csum;
-	__le16  xh_reserved2[4];
+	__le16  xh_offset;
+	__le16  xh_name_value_len;
+	__le16  xh_reserved2[2];
 	struct ocfs2_xattr_entry	xh_entries[0];
 };
 
@@ -119,4 +121,5 @@ extern int ocfs2_xattr_remove(struct inode *inode, struct
buffer_head *di_bh);
 
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
+void ocfs2_init_xattr_value_root(void);
 #endif /* OCFS2_XATTR_H */
-- 
1.5.4.GIT

Ocfs2 devel - Jun 2008 - [PATCH 0/8] ocfs2: Add extended attributes for ocfs2. V1

[Ocfs2-devel] [PATCH 0/8] ocfs2: Add extended attributes for ocfs2. V1

[Ocfs2-devel] [PATCH 7/8] ocfs2: Add extended attributes support. v1

[Ocfs2-devel] [PATCH 1/8] Modify ocfs2_num_free_extents for future xattr usage.v1

[Ocfs2-devel] [PATCH 2/8] Use ocfs2_extent_list instead of ocfs2_dinode.v1

[Ocfs2-devel] [PATCH 3/8] Make ocfs2_lock_allocators generic for extent allocation.v1

[Ocfs2-devel] [PATCH 4/8] Make extend allocation generic.v1

[Ocfs2-devel] [PATCH 5/8] Add xattr header in ocfs2.v1

[Ocfs2-devel] [PATCH 6/8] Add extent tree operation for xattr value.v1

[Ocfs2-devel] [PATCH 8/8] Add large numbers of extended attributes support for ocfs2.v1