Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 01/42] ocfs2: Reuse freed extent block in b-tree operations.
In some b-tree operations we may not have reserved any metadata at the beginning because we think we don't need any. It can then happen that the 1st operation frees an extent block while the 2nd operation needs one. Our current code can't handle this. So this patch tries to re-use the freed extent block so that we can handle the scenario above. For more details about the bug, see "[RFC] metadata alloc fix in machines which has PAGE_SIZE > CLUSTER_SIZE": http://oss.oracle.com/pipermail/ocfs2-devel/2009-March/004185.html. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/alloc.h | 7 +++++++ fs/ocfs2/suballoc.c | 23 +++++++++++++++++++++++ fs/ocfs2/suballoc.h | 3 +++ fs/ocfs2/xattr.c | 6 +++--- 5 files changed, 83 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index cb23d12..c63a7c5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -893,7 +893,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, struct buffer_head *bhs[]) { int count, status, i; - u16 suballoc_bit_start; + u16 slot, suballoc_bit_start; u32 num_got; u64 first_blkno; struct ocfs2_super *osb @@ -908,6 +908,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, handle, meta_ac, wanted - count, + &slot, &suballoc_bit_start, &num_got, &first_blkno); @@ -939,7 +940,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle, strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); eb->h_blkno = cpu_to_le64(first_blkno); eb->h_fs_generation = cpu_to_le32(osb->fs_generation); - eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); + eb->h_suballoc_slot = cpu_to_le16(slot); eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); @@ -4878,6 +4879,7 @@ static int __ocfs2_mark_extent_written(handle_t *handle, struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; + 
struct ocfs2_alloc_context local_meta_ac; if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { ret = -EIO; @@ -4930,6 +4932,17 @@ static int __ocfs2_mark_extent_written(handle_t *handle, split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, ctxt.c_split_covers_rec); + /* + * init dealloc in meta_ac here so that we can reuse the freed extent + * block in case. + * If the caller doesn't give us a meta_ac, we just fake one and add + * dealloc in it. + */ + if (!meta_ac) { + memset(&local_meta_ac, 0, sizeof(local_meta_ac)); + meta_ac = &local_meta_ac; + } + meta_ac->dealloc = dealloc; if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) ret = ocfs2_replace_extent_rec(handle, et, path, el, @@ -6140,6 +6153,38 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, return ret; } +int ocfs2_claim_bit_from_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u16 *slot, u16 *suballoc_bit, + unsigned int *num_bits, u64 *blkno) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + struct ocfs2_cached_block_free *tmp; + + if (!fl || !fl->f_first) + return -ENOMEM; + + tmp = fl->f_first->free_next; + + *slot = fl->f_slot; + *suballoc_bit = fl->f_first->free_bit; + *blkno = fl->f_first->free_blk; + *num_bits = 1; + + mlog(0, "claim blkno %llu suballoc_bit %u, slot %u\n", + (unsigned long long)*blkno, *suballoc_bit, *slot); + + kfree(fl->f_first); + fl->f_first = tmp; + + /* Free the suballocator if there is none. 
*/ + if (!fl->f_first) { + ctxt->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + + return 0; +} + static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, struct ocfs2_cached_block_free *head) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 07f6315..cd80410 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -188,10 +188,17 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) } int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, u64 blkno, unsigned int bit); +static inline int ocfs2_dealloc_has_block(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_first_suballocator != NULL; +} static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) { return c->c_global_allocator != NULL; } +int ocfs2_claim_bit_from_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u16 *slot, u16 *suballoc_bit, + unsigned int *num_bits, u64 *blkno); int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f6712ab..0cee488 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1586,6 +1586,7 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u16 *slot, u16 *suballoc_bit_start, unsigned int *num_bits, u64 *blkno_start) @@ -1594,6 +1595,27 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, u64 bg_blkno; BUG_ON(!ac); + + /* + * If there is no space reserved in ac, check whether we have + * free some before in dealloc. If yes, allocate one form it. + */ + if (ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)) { + mlog(0, "alloc context doesn't have enough meta data reserved. 
" + "It has %d, we need %u\n", + ac->ac_bits_given - ac->ac_bits_given, bits_wanted); + if (ac->dealloc && ocfs2_dealloc_has_block(ac->dealloc)) { + status = ocfs2_claim_bit_from_dealloc(ac->dealloc, + slot, + suballoc_bit_start, + num_bits, + blkno_start); + if (!status) + goto bail; + /*fail through, so that it will BUG out. */ + } + } + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); BUG_ON(ac->ac_which != OCFS2_AC_USE_META); @@ -1611,6 +1633,7 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, } atomic_inc(&osb->alloc_stats.bg_allocs); + *slot = ac->ac_alloc_slot; *blkno_start = bg_blkno + (u64) *suballoc_bit_start; ac->ac_bits_given += (*num_bits); status = 0; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index e3c13c7..d271a12 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -34,6 +34,7 @@ typedef int (group_search_t)(struct inode *, u16 *, /* *bit_off */ u16 *); /* *bits_found */ +struct ocfs2_cached_dealloc_ctxt; struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ struct buffer_head *ac_bh; /* file entry bh */ @@ -54,6 +55,7 @@ struct ocfs2_alloc_context { u64 ac_last_group; u64 ac_max_block; /* Highest block number to allocate. 
0 is is the same as ~0 - unlimited */ + struct ocfs2_cached_dealloc_ctxt *dealloc; }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); @@ -83,6 +85,7 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb, handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u16 *slot, u16 *suballoc_bit_start, u32 *num_bits, u64 *blkno_start); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 127e9d2..f324c65 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2093,7 +2093,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; - u16 suballoc_bit_start; + u16 slot, suballoc_bit_start; u32 num_got; u64 first_blkno; int ret; @@ -2108,7 +2108,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, } ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, - &suballoc_bit_start, &num_got, + &slot, &suballoc_bit_start, &num_got, &first_blkno); if (ret < 0) { mlog_errno(ret); @@ -2131,7 +2131,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, xblk = (struct ocfs2_xattr_block *)new_bh->b_data; memset(xblk, 0, inode->i_sb->s_blocksize); strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_slot = cpu_to_le16(slot); xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 02/42] ocfs2: Define refcount tree structure.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/ocfs2_fs.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c7ae45a..b3e7bfd 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -66,6 +66,7 @@ #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -154,6 +155,9 @@ /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -217,6 +221,7 @@ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ @@ -235,8 +240,14 @@ /* * Extent record flags (e_node.leaf.flags) */ -#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but - * unwritten */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ +#define OCFS2_EXT_REFCOUNT_RECORD (0x04) /* Extent record is the + * leaf of a refcount + * tree */ /* * ioctl commands @@ -465,7 +476,14 @@ struct ocfs2_extent_rec { __u8 e_flags; /* Extent flags */ }; }; - __le64 e_blkno; /* Physical disk offset, in blocks */ + union { + __le64 e_blkno; /* Physical disk offset, in blocks */ + struct { + __le32 e_refcount; /* Refrence count for + this extent. 
*/ + __le32 e_reserved2; + }; + }; /*10*/ }; @@ -705,7 +723,8 @@ struct ocfs2_dinode { __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ -/*88*/ __le64 i_reserved2[6]; + __le64 i_refcount_loc; +/*90*/ __le64 i_reserved2[5]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -805,6 +824,24 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + struct ocfs2_block_check rf_check; /* Error checking */ +/*20*/ __le64 rf_last_eb_blk; /* Pointer to last extent block */ + __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_clusters; +/*30*/ __le64 rf_reserved2[10]; +/*80*/ struct ocfs2_extent_list rf_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + /* * On disk extended attribute structure for OCFS2. */ @@ -1192,6 +1229,16 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 03/42] ocfs2: Add metaecc for ocfs2_refcount_block.
Add metaecc and journal trigger for ocfs2_refcount_block. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/journal.c | 15 +++++++++++++++ fs/ocfs2/journal.h | 3 +++ 2 files changed, 18 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e2c2d0a..6159cd8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -467,6 +467,14 @@ static struct ocfs2_triggers eb_triggers = { .ot_offset = offsetof(struct ocfs2_extent_block, h_check), }; +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + static struct ocfs2_triggers gd_triggers = { .ot_triggers = { .t_commit = ocfs2_commit_trigger, @@ -575,6 +583,13 @@ int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, type); } +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, + type); +} + int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c08d69..0eef436 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -264,6 +264,9 @@ int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, /* ocfs2_extent_block */ int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); /* ocfs2_group_desc */ int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 04/42] ocfs2: Add ocfs2_read_refcount_block.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/Makefile | 1 + fs/ocfs2/cluster/masklog.c | 1 + fs/ocfs2/cluster/masklog.h | 1 + fs/ocfs2/ocfs2.h | 3 + fs/ocfs2/refcounttree.c | 99 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 105 insertions(+), 0 deletions(-) create mode 100644 fs/ocfs2/refcounttree.c diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 0159607..31f25ce 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -28,6 +28,7 @@ ocfs2-objs := \ locks.o \ mmap.o \ namei.o \ + refcounttree.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 96df541..1cd2934 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(EXPORT), define_mask(XATTR), define_mask(QUOTA), + define_mask(REFCOUNT), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7e72a81..d233ec2 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -114,6 +114,7 @@ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ +#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8374b6b..fb2d092 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -511,6 +511,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) +#define 
OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 0000000..9790032 --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,99 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2004, 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define MLOG_MASK_PREFIX ML_REFCOUNT +#include <cluster/masklog.h> +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb + (struct ocfs2_refcount_block *)bh->b_data; + + mlog(0, "Validating refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. 
+ */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 05/42] ocfs2: Basic tree root operation.
Add basic refcount tree root operation. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/journal.h | 3 + fs/ocfs2/refcounttree.c | 271 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 26 +++++ 3 files changed, 300 insertions(+), 0 deletions(-) create mode 100644 fs/ocfs2/refcounttree.h diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0eef436..bae77f4 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -435,6 +435,9 @@ static inline int ocfs2_rename_credits(struct super_block *sb) + OCFS2_INODE_UPDATE_CREDITS \ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + + OCFS2_SUBALLOC_ALLOC + 1) +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9790032..ced1050 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -26,6 +26,9 @@ #include "super.h" #include "buffer_head_io.h" #include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) @@ -97,3 +100,271 @@ static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, return rc; } + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. 
+ */ +int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + u16 slot, suballoc_bit_start; + u32 num_got; + u64 first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(di->i_refcount_loc); + + mlog(0, "create tree for inode %lu\n", inode->i_ino); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &slot, &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. 
*/ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(slot); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(osb->sb)); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + mlog(0, "created tree for inode %lu, refblock %llu\n", + inode->i_ino, (unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_bh = NULL; + struct ocfs2_refcount_block *rb; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(di->i_refcount_loc); + + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + refcount_loc, &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), ref_bh, + 
OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + brelse(ref_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + int credits = OCFS2_INODE_UPDATE_CREDITS + 1; + u64 blk = 0, bg_blkno = 0; + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + le64_to_cpu(di->i_refcount_loc), + &blk_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. 
+ */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + brelse(blk_bh); + + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h new file mode 100644 index 0000000..9f4bdac --- /dev/null +++ b/fs/ocfs2/refcounttree.h @@ -0,0 +1,26 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * 
refcounttree.h + * + * Copyright (C) 2004, 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_REFCOUNTTREE_H +#define OCFS2_REFCOUNTTREE_H + +int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 blkno); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); + +#endif /* OCFS2_REFCOUNTTREE_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 06/42] ocfs2: hook remove refcount tree into truncate.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 5 +++++ 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index c63a7c5..9088d7b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -51,6 +51,7 @@ #include "xattr.h" #include "buffer_head_io.h" +#include "refcounttree.h" /* @@ -7295,6 +7296,10 @@ start: goto start; bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) { + /* remove the refcount tree. */ + status = ocfs2_remove_refcount_tree(inode, fe_bh); + } ocfs2_schedule_truncate_log_flush(osb, 1); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 07/42] ocfs2: Wrap ocfs2_extent_contig in ocfs2_extent_tree.
Add a new operation, eo_ocfs2_extent_contig, to the extent tree's operations, so that the new refcount tree can decide in its own function whether two extent records are contiguous. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 57 ++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9088d7b..e1ddf9b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -53,7 +53,17 @@ #include "buffer_head_io.h" #include "refcounttree.h" +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); /* * Operations for a specific extent tree type. * @@ -123,6 +133,16 @@ struct ocfs2_extent_tree_operations { * to 0 (unlimited). Optional. */ void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_ocfs2_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. 
+ */ + enum ocfs2_contig_type + (*eo_ocfs2_extent_contig)(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); }; @@ -339,6 +359,9 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, et->et_max_leaf_clusters = 0; else et->et_ops->eo_fill_max_leaf_clusters(et); + + if (!et->et_ops->eo_ocfs2_extent_contig) + et->et_ops->eo_ocfs2_extent_contig = ocfs2_extent_rec_contig; } void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, @@ -404,6 +427,16 @@ static inline int ocfs2_et_root_journal_access(handle_t *handle, type); } +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return et->et_ops->eo_ocfs2_extent_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + ext, insert_rec); +} + static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { @@ -676,17 +709,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) return ret; } -enum ocfs2_contig_type { - CONTIG_NONE = 0, - CONTIG_LEFT, - CONTIG_RIGHT, - CONTIG_LEFTRIGHT, -}; - - /* * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and - * ocfs2_extent_contig only work properly against leaf nodes! + * ocfs2_extent_rec_contig only work properly against leaf nodes! 
*/ static int ocfs2_block_extent_contig(struct super_block *sb, struct ocfs2_extent_rec *ext, @@ -712,9 +737,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, } static enum ocfs2_contig_type - ocfs2_extent_contig(struct super_block *sb, - struct ocfs2_extent_rec *ext, - struct ocfs2_extent_rec *insert_rec) + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { u64 blkno = le64_to_cpu(insert_rec->e_blkno); @@ -4240,7 +4265,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (split_rec->e_cpos == el->l_recs[index].e_cpos) ret = CONTIG_RIGHT; } else { - ret = ocfs2_extent_contig(sb, rec, split_rec); + ret = ocfs2_et_extent_contig(et, rec, split_rec); } } @@ -4285,7 +4310,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (rec) { enum ocfs2_contig_type contig_type; - contig_type = ocfs2_extent_contig(sb, rec, split_rec); + contig_type = ocfs2_et_extent_contig(et, rec, split_rec); if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) ret = CONTIG_LEFTRIGHT; @@ -4313,8 +4338,8 @@ static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - contig_type = ocfs2_extent_contig(ocfs2_metadata_cache_get_super(et->et_ci), - &el->l_recs[i], insert_rec); + contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], + insert_rec); if (contig_type != CONTIG_NONE) { insert->ins_contig_index = i; break; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 08/42] ocfs2: Wrap manipulation of leaf extent record in b-tree operation.
In b-tree operations, when we merge/split an already existing extent record, we need to modify e_blkno, e_cpos and e_leaf_clusters. However, for an ocfs2_extent_rec in a refcount tree, some of these fields have different meanings. So wrap the modification of the leaf extent record so that we can make it usable for the refcount tree. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 140 ++++++++++++++++++++++++++++++++++-------------------- 1 files changed, 89 insertions(+), 51 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e1ddf9b..e88c949 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -60,6 +60,11 @@ enum ocfs2_contig_type { CONTIG_LEFTRIGHT, }; +enum ocfs2_leaf_rec_change_type { + LEAF_CHANGE_LEFT = 0, + LEAF_CHANGE_RIGHT, +}; + static enum ocfs2_contig_type ocfs2_extent_rec_contig(struct super_block *sb, struct ocfs2_extent_rec *ext, @@ -143,6 +148,18 @@ struct ocfs2_extent_tree_operations { (*eo_ocfs2_extent_contig)(struct super_block *sb, struct ocfs2_extent_rec *ext, struct ocfs2_extent_rec *insert_rec); + + /* + * Change the specified leaf extent record. + * type indicates the action we need to do with the rec. + * If clusters < 0, it means shrinking instead of extension. + * Optional. Don't need to set it if use ocfs2_extent_rec as the + * tree leaf. 
+ */ + void (*eo_leaf_rec_change)(struct super_block *sb, + struct ocfs2_extent_rec *rec, + enum ocfs2_leaf_rec_change_type type, + int clusters); }; @@ -339,6 +356,24 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, }; +static void ocfs2_leaf_extent_rec_change(struct super_block *sb, + struct ocfs2_extent_rec *rec, + enum ocfs2_leaf_rec_change_type type, + int clusters) +{ + if (type == LEAF_CHANGE_LEFT) { + le32_add_cpu(&rec->e_cpos, -clusters); + if (clusters > 0) + le64_add_cpu(&rec->e_blkno, + -ocfs2_clusters_to_blocks(sb, clusters)); + else + le64_add_cpu(&rec->e_blkno, + ocfs2_clusters_to_blocks(sb, -clusters)); + } + + le16_add_cpu(&rec->e_leaf_clusters, clusters); +} + static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct ocfs2_caching_info *ci, struct buffer_head *bh, @@ -362,6 +397,9 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!et->et_ops->eo_ocfs2_extent_contig) et->et_ops->eo_ocfs2_extent_contig = ocfs2_extent_rec_contig; + + if (!et->et_ops->eo_leaf_rec_change) + et->et_ops->eo_leaf_rec_change = ocfs2_leaf_extent_rec_change; } void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, @@ -437,6 +475,16 @@ static inline enum ocfs2_contig_type ext, insert_rec); } +static inline void ocfs2_et_leaf_rec_change(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + enum ocfs2_leaf_rec_change_type type, + int clusters) +{ + return et->et_ops->eo_leaf_rec_change( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, type, clusters); +} + static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { @@ -3302,13 +3350,11 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, goto out; } - le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); + ocfs2_et_leaf_rec_change(et, left_rec, + LEAF_CHANGE_RIGHT, -split_clusters); - 
le32_add_cpu(&right_rec->e_cpos, -split_clusters); - le64_add_cpu(&right_rec->e_blkno, - -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), - split_clusters)); - le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); + ocfs2_et_leaf_rec_change(et, right_rec, + LEAF_CHANGE_LEFT, split_clusters); ocfs2_cleanup_merge(el, index); @@ -3479,13 +3525,11 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, has_empty_extent = 0; } else - le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); + ocfs2_et_leaf_rec_change(et, left_rec, + LEAF_CHANGE_RIGHT, split_clusters); - le32_add_cpu(&right_rec->e_cpos, split_clusters); - le64_add_cpu(&right_rec->e_blkno, - ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), - split_clusters)); - le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); + ocfs2_et_leaf_rec_change(et, right_rec, + LEAF_CHANGE_LEFT, -split_clusters); ocfs2_cleanup_merge(el, index); @@ -3664,33 +3708,27 @@ out: return ret; } -static void ocfs2_subtract_from_rec(struct super_block *sb, +static void ocfs2_subtract_from_rec(struct ocfs2_extent_tree *et, enum ocfs2_split_type split, struct ocfs2_extent_rec *rec, struct ocfs2_extent_rec *split_rec) { - u64 len_blocks; - - len_blocks = ocfs2_clusters_to_blocks(sb, - le16_to_cpu(split_rec->e_leaf_clusters)); + int clusters = le16_to_cpu(split_rec->e_leaf_clusters); if (split == SPLIT_LEFT) { /* * Region is on the left edge of the existing * record. */ - le32_add_cpu(&rec->e_cpos, - le16_to_cpu(split_rec->e_leaf_clusters)); - le64_add_cpu(&rec->e_blkno, len_blocks); - le16_add_cpu(&rec->e_leaf_clusters, - -le16_to_cpu(split_rec->e_leaf_clusters)); + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_LEFT, -clusters); } else { /* * Region is on the right edge of the existing * record. 
*/ - le16_add_cpu(&rec->e_leaf_clusters, - -le16_to_cpu(split_rec->e_leaf_clusters)); + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_RIGHT, -clusters); } } @@ -3707,6 +3745,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, int i = insert->ins_contig_index; unsigned int range; struct ocfs2_extent_rec *rec; + unsigned int insert_clusters = le16_to_cpu(insert_rec->e_leaf_clusters); BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); @@ -3714,7 +3753,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); BUG_ON(i == -1); rec = &el->l_recs[i]; - ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + ocfs2_subtract_from_rec(et, insert->ins_split, rec, insert_rec); goto rotate; @@ -3726,11 +3765,14 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, if (insert->ins_contig != CONTIG_NONE) { rec = &el->l_recs[i]; if (insert->ins_contig == CONTIG_LEFT) { - rec->e_blkno = insert_rec->e_blkno; - rec->e_cpos = insert_rec->e_cpos; - } - le16_add_cpu(&rec->e_leaf_clusters, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_LEFT, + insert_clusters); + } else + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_RIGHT, + insert_clusters); + return; } @@ -3985,8 +4027,7 @@ static void ocfs2_split_record(struct ocfs2_extent_tree *et, } rec = &el->l_recs[index]; - ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), - split, rec, split_rec); + ocfs2_subtract_from_rec(et, split, rec, split_rec); ocfs2_rotate_leaf(insert_el, split_rec); } @@ -4724,24 +4765,23 @@ leave: return status; } -static void ocfs2_make_right_split_rec(struct super_block *sb, +static void ocfs2_make_right_split_rec(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, u32 cpos, struct ocfs2_extent_rec *rec) { u32 rec_cpos = le32_to_cpu(rec->e_cpos); - u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); - - 
memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); - split_rec->e_cpos = cpu_to_le32(cpos); - split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); - - split_rec->e_blkno = rec->e_blkno; - le64_add_cpu(&split_rec->e_blkno, - ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); + /* + * We want to create a right split rec which start from cpos + * while end at the original rec's end. So we first make + * the split_rec the same as rec and then shrink it to the + * specified size. + */ + *split_rec = *rec; - split_rec->e_flags = rec->e_flags; + ocfs2_et_leaf_rec_change(et, split_rec, LEAF_CHANGE_LEFT, + rec_cpos - cpos); } static int ocfs2_split_and_insert(handle_t *handle, @@ -4808,8 +4848,7 @@ leftright: */ insert.ins_split = SPLIT_RIGHT; - ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), - &tmprec, insert_range, &rec); + ocfs2_make_right_split_rec(et, &tmprec, insert_range, &rec); split_rec = tmprec; @@ -5090,8 +5129,7 @@ static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, */ el = path_leaf_el(path); rec = &el->l_recs[index]; - ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), - &split_rec, new_range, rec); + ocfs2_make_right_split_rec(et, &split_rec, new_range, rec); depth = path->p_tree_depth; if (depth > 0) { @@ -5260,12 +5298,12 @@ static int ocfs2_truncate_rec(handle_t *handle, } } else if (le32_to_cpu(rec->e_cpos) == cpos) { /* Remove leftmost portion of the record. */ - le32_add_cpu(&rec->e_cpos, len); - le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); - le16_add_cpu(&rec->e_leaf_clusters, -len); + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_LEFT, -len); } else if (rec_range == trunc_range) { /* Remove rightmost portion of the record */ - le16_add_cpu(&rec->e_leaf_clusters, -len); + ocfs2_et_leaf_rec_change(et, rec, + LEAF_CHANGE_RIGHT, -len); if (is_rightmost_tree_rec) ocfs2_adjust_rightmost_records(handle, et, path, rec); } else { -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 09/42] ocfs2: Pass ocfs2_insert_extent an inserted leaf record.
In the old b-tree operation, ocfs2_insert_extent organizes the inserted leaf extent record by itself. So pull the creation of ocfs2_extent_rec up into the caller so that it can also handle an ocfs2_extent_rec in a refcount tree. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 40 +++++++++++++++++++--------------------- fs/ocfs2/alloc.h | 5 +---- fs/ocfs2/dir.c | 14 ++++++++++---- fs/ocfs2/xattr.c | 9 +++++++-- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e88c949..d79a62e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4593,34 +4593,21 @@ out: */ int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, - u32 cpos, - u64 start_blk, - u32 new_clusters, - u8 flags, + struct ocfs2_extent_rec *rec, struct ocfs2_alloc_context *meta_ac) { int status; int uninitialized_var(free_records); struct buffer_head *last_eb_bh = NULL; struct ocfs2_insert_type insert = {0, }; - struct ocfs2_extent_rec rec; - - mlog(0, "add %u clusters at position %u to owner %llu\n", - new_clusters, cpos, - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - memset(&rec, 0, sizeof(rec)); - rec.e_cpos = cpu_to_le32(cpos); - rec.e_blkno = cpu_to_le64(start_blk); - rec.e_leaf_clusters = cpu_to_le16(new_clusters); - rec.e_flags = flags; - status = ocfs2_et_insert_check(et, &rec); + status = ocfs2_et_insert_check(et, rec); if (status) { mlog_errno(status); goto bail; } - status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(et, &last_eb_bh, rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4644,11 +4631,11 @@ int ocfs2_insert_extent(handle_t *handle, } /* Finally, we can add clusters. This might rotate the tree for us. 
*/ - status = ocfs2_do_insert_extent(handle, et, &rec, &insert); + status = ocfs2_do_insert_extent(handle, et, rec, &insert); if (status < 0) mlog_errno(status); else - ocfs2_et_extent_map_insert(et, &rec); + ocfs2_et_extent_map_insert(et, rec); bail: brelse(last_eb_bh); @@ -4681,6 +4668,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, u8 flags = 0; struct ocfs2_super *osb OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + struct ocfs2_extent_rec rec; BUG_ON(!clusters_to_add); @@ -4735,8 +4723,13 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, mlog(0, "Allocating %u clusters at block %u for owner %llu\n", num_bits, bit_off, (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - status = ocfs2_insert_extent(handle, et, *logical_offset, block, - num_bits, flags, meta_ac); + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(*logical_offset); + rec.e_blkno = cpu_to_le64(block); + rec.e_leaf_clusters = cpu_to_le16(num_bits); + rec.e_flags = flags; + status = ocfs2_insert_extent(handle, et, &rec, meta_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -7183,13 +7176,18 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ocfs2_journal_dirty(handle, di_bh); if (has_data) { + struct ocfs2_extent_rec rec; + /* * An error at this point should be extremely rare. If * this proves to be false, we could always re-build * the in-inode data from our pages. 
*/ ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); - ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); + memset(&rec, 0, sizeof(rec)); + rec.e_blkno = cpu_to_le64(block); + rec.e_leaf_clusters = cpu_to_le16(1); + ret = ocfs2_insert_extent(handle, &et, &rec, NULL); if (ret) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index cd80410..4eeb59f 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -89,10 +89,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, struct ocfs2_alloc_context; int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, - u32 cpos, - u64 start_blk, - u32 new_clusters, - u8 flags, + struct ocfs2_extent_rec *rec, struct ocfs2_alloc_context *meta_ac); enum ocfs2_alloc_restarted { diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b361bfb..7020bfa 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1412,6 +1412,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; + struct ocfs2_extent_rec rec; int did_quota = 0; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); @@ -1535,8 +1536,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * This should never fail as our extent list is empty and all * related blocks have been journaled already. 
*/ - ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, - 0, NULL); + memset(&rec, 0, sizeof(rec)); + rec.e_blkno = cpu_to_le64(blkno); + rec.e_leaf_clusters = cpu_to_le16(len); + ret = ocfs2_insert_extent(handle, &et, &rec, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -1567,8 +1570,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(handle, &et, 1, - blkno, len, 0, NULL); + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(1); + rec.e_blkno = cpu_to_le64(blkno); + rec.e_leaf_clusters = cpu_to_le16(len); + ret = ocfs2_insert_extent(handle, &et, &rec, NULL); if (ret) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f324c65..291ca13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4253,6 +4253,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, handle_t *handle = ctxt->handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; + struct ocfs2_extent_rec rec; mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " "previous xattr blkno = %llu\n", @@ -4315,8 +4316,12 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", num_bits, (unsigned long long)block, v_start); - ret = ocfs2_insert_extent(handle, &et, v_start, block, - num_bits, 0, ctxt->meta_ac); + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(v_start); + rec.e_blkno = cpu_to_le64(block); + rec.e_leaf_clusters = cpu_to_le16(num_bits); + ret = ocfs2_insert_extent(handle, &et, &rec, ctxt->meta_ac); if (ret < 0) { mlog_errno(ret); goto leave; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 10/42] ocfs2: Abstract extent split process.
ocfs2_mark_extent_written actually does the following things: 1. check the parameters. 2. initialize the left_path and split_rec. 3. call __ocfs2_mark_extent_written. It will do: 1) check the flags of unwritten 2) do the real split work. The whole process is packed tightly somehow. So this patch will abstract 2 different functions so that future b-tree operation can work with it. 1. __ocfs2_split_extent will accept path and split_rec and do the real split work. 2. ocfs2_change_extent_flag will accept a new flag and initialize path and split_rec. So now ocfs2_mark_extent_written will do: 1. check the parameters. 2. call ocfs2_change_extent_flag. 1) initialize the left_path and split_rec. 2) check whether the new flags conflict with the old one. 3) call __ocfs2_split_extent to do the split. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 142 ++++++++++++++++++++++++++++++++---------------------- 1 files changed, 85 insertions(+), 57 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d79a62e..912f29c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4904,10 +4904,9 @@ out: } /* - * Mark part or all of the extent record at split_index in the leaf - * pointed to by path as written. This removes the unwritten - * extent flag. - * + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. + * Care is taken to handle contiguousness so as to not grow the tree. * * meta_ac is not strictly necessary - we only truly need it if growth @@ -4923,13 +4922,13 @@ out: * have been brought into cache (and pinned via the journal), so the * extra overhead is not expressed in terms of disk reads. 
*/ -static int __ocfs2_mark_extent_written(handle_t *handle, - struct ocfs2_extent_tree *et, - struct ocfs2_path *path, - int split_index, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static int __ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -4939,12 +4938,6 @@ static int __ocfs2_mark_extent_written(handle_t *handle, struct ocfs2_extent_list *rightmost_el; struct ocfs2_alloc_context local_meta_ac; - if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { @@ -5024,43 +5017,20 @@ out: return ret; } -/* - * Mark the already-existing extent at cpos as written for len clusters. - * - * If the existing extent is larger than the request, initiate a - * split. An attempt will be made at merging with adjacent extents. - * - * The caller is responsible for passing down meta_ac if we'll need it. 
- */ -int ocfs2_mark_extent_written(struct inode *inode, - struct ocfs2_extent_tree *et, - handle_t *handle, u32 cpos, u32 len, u32 phys, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) { int ret, index; - u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); struct ocfs2_extent_rec split_rec; struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el; - - mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", - inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); - - if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - ret = -EROFS; - goto out; - } - - /* - * XXX: This should be fixed up so that we just re-insert the - * next extent records. 
- */ - ocfs2_et_extent_map_truncate(et, 0); + struct ocfs2_extent_rec *rec; left_path = ocfs2_new_path_from_et(et); if (!left_path) { @@ -5078,30 +5048,88 @@ int ocfs2_mark_extent_written(struct inode *inode, index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; } + rec = &el->l_recs[index]; + if ((new_flags && (rec->e_flags & new_flags)) || + (clear_flags && !(rec->e_flags & clear_flags))) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); split_rec.e_cpos = cpu_to_le32(cpos); split_rec.e_leaf_clusters = cpu_to_le16(len); split_rec.e_blkno = cpu_to_le64(start_blkno); - split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; - split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + else + split_rec.e_flags &= ~clear_flags; - ret = __ocfs2_mark_extent_written(handle, et, left_path, - index, &split_rec, meta_ac, - dealloc); + ret = __ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(left_path); return ret; + +} + +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. 
+ */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. + */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; } static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 11/42] ocfs2: Add refcount b-tree as a new extent tree.
Add refcount b-tree as a new extent tree so that it can use the b-tree to store and manipulate ocfs2_refcount_rec. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/alloc.h | 3 ++ 2 files changed, 81 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 912f29c..6a7dada 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -163,6 +163,9 @@ struct ocfs2_extent_tree_operations { }; +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right); + /* * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check * in the methods. @@ -356,6 +359,73 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, }; +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type + ocfs2_refcount_tree_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + if (ext->e_refcount != insert_rec->e_refcount) + return CONTIG_NONE; + + if (ocfs2_extents_adjacent(ext, insert_rec)) + return CONTIG_RIGHT; + + if (ocfs2_extents_adjacent(insert_rec, ext)) + return CONTIG_LEFT; + + return 
CONTIG_NONE; +} + +static void ocfs2_refcount_leaf_rec_change(struct super_block *sb, + struct ocfs2_extent_rec *rec, + enum ocfs2_leaf_rec_change_type type, + int clusters) +{ + if (type == LEAF_CHANGE_LEFT) + le32_add_cpu(&rec->e_cpos, -clusters); + + le16_add_cpu(&rec->e_leaf_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_ocfs2_extent_contig = ocfs2_refcount_tree_extent_contig, + .eo_leaf_rec_change = ocfs2_refcount_leaf_rec_change, +}; + static void ocfs2_leaf_extent_rec_change(struct super_block *sb, struct ocfs2_extent_rec *rec, enum ocfs2_leaf_rec_change_type type, @@ -426,6 +496,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, &ocfs2_xattr_value_et_ops); } +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 new_last_eb_blk) { diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4eeb59f..49eff75 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -77,6 +77,9 @@ struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct ocfs2_caching_info *ci, struct ocfs2_xattr_value_buf *vb); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); /* * Read an extent block into *bh. If *bh is NULL, a bh will be -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 12/42] ocfs2: export tree operation functions.
Now fs/ocfs2/alloc.c has more than 7000 lines. It contains our basic b-tree operations. Although we have already made our b-tree operations generic, the basic structure ocfs2_path, which is used to iterate over one b-tree branch, is still static and can only be used in alloc.c. As the refcount tree needs them and I don't want to add any more b-tree-unrelated code to alloc.c, export them. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 74 ++++++++++++++++------------------------------------- fs/ocfs2/alloc.h | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 51 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 6a7dada..7221073 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -585,40 +585,12 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); - -/* - * Structures which describe a path through a btree, and functions to - * manipulate them. - * - * The idea here is to be as generic as possible with the tree - * manipulation code. - */ -struct ocfs2_path_item { - struct buffer_head *bh; - struct ocfs2_extent_list *el; -}; - -#define OCFS2_MAX_PATH_DEPTH 5 - -struct ocfs2_path { - int p_tree_depth; - ocfs2_journal_access_func p_root_access; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; -}; - -#define path_root_bh(_path) ((_path)->p_node[0].bh) -#define path_root_el(_path) ((_path)->p_node[0].el) -#define path_root_access(_path)((_path)->p_root_access) -#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) -#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) -#define path_num_items(_path) ((_path)->p_tree_depth + 1) - /* * Reset the actual path elements so that we can re-use the structure * to build another path. 
Generally, this involves freeing the buffer * heads. */ -static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) { int i, start = 0, depth = 0; struct ocfs2_path_item *node; @@ -647,7 +619,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) path->p_tree_depth = depth; } -static void ocfs2_free_path(struct ocfs2_path *path) +void ocfs2_free_path(struct ocfs2_path *path) { if (path) { ocfs2_reinit_path(path, 0); @@ -745,13 +717,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, return path; } -static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) { return ocfs2_new_path(path_root_bh(path), path_root_el(path), path_root_access(path)); } -static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) { return ocfs2_new_path(et->et_root_bh, et->et_root_el, et->et_root_journal_access); @@ -764,10 +736,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) * I don't like the way this function's name looks next to * ocfs2_journal_access_path(), but I don't have a better one. */ -static int ocfs2_path_bh_journal_access(handle_t *handle, - struct ocfs2_caching_info *ci, - struct ocfs2_path *path, - int idx) +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) { ocfs2_journal_access_func access = path_root_access(path); @@ -784,9 +756,9 @@ static int ocfs2_path_bh_journal_access(handle_t *handle, /* * Convenience function to journal all components in a path. 
*/ -static int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, - handle_t *handle, - struct ocfs2_path *path) +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) { int i, ret = 0; @@ -1890,8 +1862,8 @@ static void find_path_ins(void *data, struct buffer_head *bh) ocfs2_path_insert_eb(fp->path, fp->index, bh); fp->index++; } -static int ocfs2_find_path(struct ocfs2_caching_info *ci, - struct ocfs2_path *path, u32 cpos) +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) { struct find_path_data data; @@ -5000,13 +4972,13 @@ out: * have been brought into cache (and pinned via the journal), so the * extra overhead is not expressed in terms of disk reads. */ -static int __ocfs2_split_extent(handle_t *handle, - struct ocfs2_extent_tree *et, - struct ocfs2_path *path, - int split_index, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5153,9 +5125,9 @@ static int ocfs2_change_extent_flag(handle_t *handle, else split_rec.e_flags &= ~clear_flags; - ret = __ocfs2_split_extent(handle, et, left_path, - index, &split_rec, meta_ac, - dealloc); + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 49eff75..c89197a 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -109,6 +109,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int 
ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, @@ -255,4 +263,45 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); #endif /* OCFS2_ALLOC_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 13/42] ocfs2: Add support for incrementing refcount in the tree.
Given a physical cpos and length, increment the refcount in the tree. If the extent has not been seen before, a refcount record is created for it. Refcount records may be merged or split by this operation. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/extent_map.c | 15 ++-- fs/ocfs2/extent_map.h | 5 + fs/ocfs2/refcounttree.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 208 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index dc9482c..40b5105 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, * eb_bh is NULL. Otherwise, eb_bh should point to the extent block * containing el. */ -static int ocfs2_figure_hole_clusters(struct inode *inode, - struct ocfs2_extent_list *el, - struct buffer_head *eb_bh, - u32 v_cluster, - u32 *num_clusters) +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { int ret, i; struct buffer_head *next_eb_bh = NULL; @@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_extent_block(INODE_CACHE(inode), + ret = ocfs2_read_extent_block(ci, le64_to_cpu(eb->h_next_leaf_blk), &next_eb_bh); if (ret) { @@ -456,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, * field. 
*/ if (hole_len) { - ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, v_cluster, &len); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index b7dd973..9942f47 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -61,6 +61,11 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, struct buffer_head **bh, int (*validate)(struct super_block *sb, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ced1050..3b6f327 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -29,6 +29,7 @@ #include "refcounttree.h" #include "sysfile.h" #include "dlmglue.h" +#include "extent_map.h" static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) @@ -368,3 +369,197 @@ out: return ret; } + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_count = 0. 
+ */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos, unsigned int len, + struct ocfs2_extent_rec *ret_rec, + int *index) +{ + u32 hole_len; + int i, ret = 0; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct buffer_head *eb_bh = NULL; + + memset(ret_rec, 0, sizeof(*ret_rec)); + + ret = ocfs2_find_path(ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Refcount tree %llu " + "has empty record in " + "block %llu, index %d\n", + (unsigned long long) + ocfs2_metadata_cache_owner(ci), + (unsigned long long) + path_leaf_bh(path)->b_blocknr, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + else if (le32_to_cpu(rec->e_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + *ret_rec = *rec; + *index = i; + goto out; + } + + /* We meet with a hole here. 
*/ + if (path->p_tree_depth) + eb_bh = path_leaf_bh(path); + ret = ocfs2_figure_hole_clusters(ci, el, eb_bh, cpos, &hole_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (hole_len > UINT_MAX) + hole_len = UINT_MAX; + else if (hole_len > len) + hole_len = len; + + ret_rec->e_cpos = cpu_to_le32(cpos); + ret_rec->e_leaf_clusters = cpu_to_le16(hole_len); + ret_rec->e_refcount = 0; + +out: + return ret; +} + +static int ocfs2_incre_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, int index) +{ + int ret; + struct ocfs2_extent_list *el = path_leaf_el(path); + struct ocfs2_extent_rec *rec = &el->l_recs[index]; + + ret = ocfs2_path_bh_journal_access(handle, ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + le32_add_cpu(&rec->e_refcount, 1); + + ret = ocfs2_journal_dirty(handle, path_leaf_bh(path)); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct ocfs2_extent_rec rec; + unsigned int set_len = 0; + struct ocfs2_path *path = NULL; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + mlog(0, "Tree owner %llu, add refcount start %u, len %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(et->et_ci, path, cpos, len, + &rec, &index); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le16_to_cpu(rec.e_leaf_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_count + * and it is OK. + * 2. If we find a hole, just insert it with r_count = 1. + * 3. 
If we are in the middle of one extent record, split + * it. + */ + if (rec.e_refcount && le32_to_cpu(rec.e_cpos) == cpos && + set_len <= len) { + mlog(0, "increase refcount rec, start %u, len %u, " + "count %u\n", cpos, set_len, + le32_to_cpu(rec.e_refcount)); + ret = ocfs2_incre_refcount_rec(handle, et->et_ci, + path, index); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.e_refcount) { + rec.e_refcount = cpu_to_le32(1); + rec.e_flags = OCFS2_EXT_REFCOUNT_RECORD; + + mlog(0, "insert refcount rec, start %u, len %u\n", + le32_to_cpu(rec.e_cpos), set_len); + ret = ocfs2_insert_extent(handle, et, &rec, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u32)(cpos + len), + le32_to_cpu(rec.e_cpos) + set_len) - cpos; + rec.e_cpos = cpu_to_le32(cpos); + rec.e_leaf_clusters = cpu_to_le16(set_len); + le32_add_cpu(&rec.e_refcount, 1); + + mlog(0, "split efcount rec, start %u, len %u, " + "count %u\n", le32_to_cpu(rec.e_cpos), + set_len, le32_to_cpu(rec.e_refcount)); + ret = ocfs2_split_extent(handle, et, + path, index, + &rec, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + ocfs2_reinit_path(path, 1); + } + +out: + ocfs2_free_path(path); + return ret; +} -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 14/42] ocfs2: Add support of decrementing refcount for delete.
Given a physical cpos and length, decrement the refcount in the tree. If the refcount for any portion of the extent goes to zero, that portion is queued for freeing. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 4 ++ 2 files changed, 119 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3b6f327..6b42ce8 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -563,3 +563,118 @@ out: ocfs2_free_path(path); return ret; } + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index = 0; + struct ocfs2_extent_rec rec; + unsigned int r_count = 0, r_len; + struct ocfs2_path *path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + mlog(0, "Tree owner %llu, decrease refcount start %u, len %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(et->et_ci, path, cpos, len, + &rec, &index); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.e_refcount); + BUG_ON(r_count == 0); + + r_len = min((u32)(cpos + len), le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters)) - cpos; + + /* + * Now decrease the refcount: + * + * 1. r_count == 1, remove it from refcount tree and queue + * clusters for free. + * 2. r_count > 2, split the record. 
+ */ + if (le32_to_cpu(rec.e_refcount) == 1) { + ret = ocfs2_remove_extent(handle, et, cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + rec.e_cpos = cpu_to_le32(cpos); + rec.e_leaf_clusters = cpu_to_le16(r_len); + + le32_add_cpu(&rec.e_refcount, -1); + + ret = ocfs2_split_extent(handle, et, path, index, + &rec, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + ocfs2_reinit_path(path, 1); + } + +out: + ocfs2_free_path(path); + return ret; +} + +int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_bh = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di->i_refcount_loc); + + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + le64_to_cpu(di->i_refcount_loc), + &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_refcount_extent_tree(&et, INODE_CACHE(inode), ref_bh); + ret = __ocfs2_decrease_refcount(handle, &et, + cpos, len, meta_ac, dealloc); + if (ret) + mlog_errno(ret); +out: + brelse(ref_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9f4bdac..92fe116 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -23,4 +23,8 @@ int ocfs2_set_refcount_tree(struct inode *inode, u64 blkno); int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, u32 cpos, u32 len, + struct 
ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); #endif /* OCFS2_REFCOUNTTREE_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 15/42] ocfs2: Add functions for extents refcounted.
Add function ocfs2_mark_extent_refcounted which can mark an extent refcounted. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 12 ++++++------ fs/ocfs2/alloc.h | 6 ++++++ fs/ocfs2/ocfs2.h | 7 +++++++ fs/ocfs2/refcounttree.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 7221073..bbfb15a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5067,12 +5067,12 @@ out: return ret; } -static int ocfs2_change_extent_flag(handle_t *handle, - struct ocfs2_extent_tree *et, - u32 cpos, u32 len, u32 phys, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc, - int new_flags, int clear_flags) +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) { int ret, index; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index c89197a..6ea9ecf 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -122,6 +122,12 @@ int ocfs2_mark_extent_written(struct inode *inode, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fb2d092..74db43f 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -431,6 +431,13 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) return 0; } +static inline int 
ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6b42ce8..73b49cd 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -678,3 +678,42 @@ out: brelse(ref_bh); return ret; } + +/* + * Mark the already-existing extent at cpos as refcounted for len clusters. + * This adds the refcount extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 16/42] ocfs2: Hook 'Decrement refcount for delete'.
Hook 'Decrement refcount for delete' into the normal truncate process, so that for a refcounted extent record we decrement the refcount record instead of freeing the clusters directly. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 47 +++++++++++++++++++++++++++++++++++------ fs/ocfs2/refcounttree.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 4 +++ 3 files changed, 97 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index bbfb15a..69a6356 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6592,7 +6592,7 @@ out: */ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start) + u32 clusters_to_del, u64 *delete_start, u8 *flags) { int ret, i, index = path->p_tree_depth; u32 new_edge = 0; @@ -6602,6 +6602,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, struct ocfs2_extent_rec *rec; *delete_start = 0; + *flags = 0; while (index >= 0) { bh = path->p_node[index].bh; @@ -6689,6 +6690,7 @@ find_tail_record: *delete_start = le64_to_cpu(rec->e_blkno) + ocfs2_clusters_to_blocks(inode->i_sb, le16_to_cpu(rec->e_leaf_clusters)); + *flags = rec->e_flags; /* * If it's now empty, remove this record.
@@ -6788,7 +6790,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_truncate_context *tc, - struct ocfs2_path *path) + struct ocfs2_path *path, + struct ocfs2_alloc_context *meta_ac) { int status; struct ocfs2_dinode *fe; @@ -6796,6 +6799,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct ocfs2_extent_list *el; struct buffer_head *last_eb_bh = NULL; u64 delete_blk = 0; + u8 rec_flags; fe = (struct ocfs2_dinode *) fe_bh->b_data; @@ -6851,7 +6855,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk); + clusters_to_del, &delete_blk, &rec_flags); if (status) { mlog_errno(status); goto bail; @@ -6883,8 +6887,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } if (delete_blk) { - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); + if (rec_flags & OCFS2_EXT_REFCOUNTED) + status = ocfs2_decrease_refcount(inode, fe_bh, handle, + ocfs2_blocks_to_clusters(osb->sb, + delete_blk), + clusters_to_del, meta_ac, + &tc->tc_dealloc); + else + status = ocfs2_truncate_log_append(osb, handle, + delete_blk, + clusters_to_del); if (status < 0) { mlog_errno(status); goto bail; @@ -7312,6 +7324,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; mlog_entry_void(); @@ -7337,6 +7350,8 @@ start: goto bail; } + credits = 0; + /* * Truncate always works against the rightmost tree branch. 
*/ @@ -7389,6 +7404,16 @@ start: mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED) { + status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh, + &credits, + &meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + mutex_lock(&tl_inode->i_mutex); tl_sem = 1; /* ocfs2_truncate_log_needs_flush guarantees us at least one @@ -7402,7 +7427,7 @@ start: } } - credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, (struct ocfs2_dinode *)fe_bh->b_data, el); handle = ocfs2_start_trans(osb, credits); @@ -7414,7 +7439,7 @@ start: } status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path); + tc, path, meta_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -7428,6 +7453,11 @@ start: ocfs2_reinit_path(path, 1); + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + /* * The check above will catch the case where we've truncated * away all allocation. @@ -7442,6 +7472,9 @@ bail: ocfs2_schedule_truncate_log_flush(osb, 1); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (tl_sem) mutex_unlock(&tl_inode->i_mutex); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73b49cd..f32899d 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -717,3 +717,56 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, out: return ret; } + +/* + * For refcount tree, we will decrease some contiguous clusters + * refcount count, since these clusters are contiguous, we will + * at most split 2 extents(the first and the last). And there is + * nothing to do with how much clusters we will handle. 
+ */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, extents_split = 2; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_extent_tree et; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di->i_refcount_loc); + + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + le64_to_cpu(di->i_refcount_loc), + &ref_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_bh->b_data; + ocfs2_init_refcount_extent_tree(&et, INODE_CACHE(inode), ref_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, extents_split, + NULL, meta_ac); + if (ret < 0) + mlog_errno(ret); + + *credits += ocfs2_calc_extend_credits(inode->i_sb, et.et_root_el, + extents_split); + +out: + brelse(ref_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 92fe116..9b646c5 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -27,4 +27,8 @@ int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac); #endif /* OCFS2_REFCOUNTTREE_H */ -- 1.6.2.rc2.16.gf474c
When replacing the old refcounted extent record, I don't remove the old extent record first and then insert the new one, because during the tree manipulation (e.g., ocfs2_remove_extent) we often need to call ocfs2_extend_trans, which may restart our transaction. If we crashed right after the removal and before the insertion, we would lose the data. So the whole process will be: 1. If we are replacing the whole extent record, just copy the data and replace e_blkno. 2. If we are splitting the extent record, just initialize the data and then call ocfs2_split_extent directly; the tree code is modified so that it can handle this. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 189 +++++++++++++- fs/ocfs2/alloc.h | 10 + fs/ocfs2/aops.c | 49 ++++- fs/ocfs2/refcounttree.c | 662 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 2 + 5 files changed, 900 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 69a6356..19f80ef 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -582,6 +582,170 @@ static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) return ret; } +static int ocfs2_leaf_eb_num(struct super_block *sb, + struct ocfs2_extent_tree *et, int *num_ebs) +{ + int i, ret = 0; + int ebs = 0, recs_per_eb = ocfs2_extent_recs_per_eb(sb); + struct ocfs2_path *path; + + *num_ebs = 0; + + if (!et->et_root_el->l_tree_depth) + return 0; + + path = ocfs2_new_path_from_et(et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + ret = ocfs2_find_path(et->et_ci, path, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < path->p_tree_depth; i++) + ebs = ebs * recs_per_eb + + le16_to_cpu(path->p_node[i].el->l_next_free_rec); + + *num_ebs = ebs; +out: + ocfs2_free_path(path); + return ret; +} + +/* + * Calculate how much credits we need for adding meta_add blocks. + * It is only used when we want to allocate a lot of metas at one time.
+ */ +int ocfs2_calc_create_meta_credits(struct super_block *sb, + int meta_add) +{ + int max_sub_alloc = 0, max_group; + int bitmap_blocks; + + /* + * calculate the global_bitmap first. It include all the group + * and the dinode and one more group if we relink. + */ + max_group = meta_add / (OCFS2_SB(sb)->bitmap_cpg * 8) + 1; + bitmap_blocks = 1 + max_group * 2; + + /* calculate the local alloc file. */ + if (OCFS2_SB(sb)->local_alloc_bits) + max_sub_alloc = meta_add / OCFS2_SB(sb)->local_alloc_bits + 1; + + mlog(0, "bitmap %d, max_sub %d, meta_add %d\n", bitmap_blocks, + max_sub_alloc, meta_add); + + return bitmap_blocks + max_sub_alloc + meta_add + 1 + + ocfs2_quota_trans_credits(sb); +} + +/* + * Calculate if we add num_recs to ocfs2_extent_tree how much meta and credits + * we need at most. Useful when we will add a lot of recs in the tree. + */ +int ocfs2_calc_tree_change_need(struct super_block *sb, + struct ocfs2_extent_tree *et, int num_recs, + int *meta_add, int *credits) +{ + int ret = 0, i, tree_depth; + int num_ebs, leaf_eb_need, max_leaf_eb, max_empty_eb; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + int recs_per_eb = ocfs2_extent_recs_per_eb(sb); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + *credits += ocfs2_calc_extend_credits(sb, + et->et_root_el, + num_recs); + + if (num_free_extents >= num_recs) + return ret; + + /* We have to change the tree(branch_add or tree_shift). */ + leaf_eb_need = num_recs / recs_per_eb + 1; + + ret = ocfs2_leaf_eb_num(sb, et, &num_ebs); + if (ret) { + mlog_errno(ret); + goto out; + } + + tree_depth = le16_to_cpu(et->et_root_el->l_tree_depth); + max_leaf_eb = le16_to_cpu(et->et_root_el->l_count); + + /* + * tree_depth = 0 is a special case, we have to copy all the extent + * record from root to the 1st extent block and the num of the new + * added leaf extent block is limit to l_count, not recs_per_eb. 
+ */ + if (!tree_depth) { + max_empty_eb = max_leaf_eb - 1; + goto shift_tree; + } + + for (i = 0; i < tree_depth - 1; i++) + max_leaf_eb *= recs_per_eb; + + /* + * The empty leaf eb we could have with branch add. + */ + max_empty_eb = max_leaf_eb - num_ebs; + + if (leaf_eb_need <= max_empty_eb) { + /* We could have enough meta by just adding branch. */ + *meta_add += tree_depth * leaf_eb_need; + goto out; + } + + /* + * We have to shift the tree now. + * calculate the empty eb we could have after one tree shift. + */ + max_leaf_eb = max_leaf_eb * recs_per_eb; + max_empty_eb = max_leaf_eb - num_ebs; + +shift_tree: + if (max_leaf_eb <= max_empty_eb) { + /* OK, just one tree shift can have enough space for us. */ + *meta_add += (tree_depth + 1) * leaf_eb_need; + goto out; + } + + /* sorry, we have to shift the tree twice. */ + max_leaf_eb = max_leaf_eb * recs_per_eb; + max_empty_eb = max_leaf_eb - num_ebs; + + /* + * Even with a tree shift twice, we can't handle this. + * We'd better BUG_OUT here since the meta allocators + * may not handle this also. 
+ */ + mlog_bug_on_msg(max_leaf_eb > max_empty_eb, "extent tree %llu wants to " + "add %d new extent recs, depth %u, count %u, next free %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + num_recs, le16_to_cpu(et->et_root_el->l_tree_depth), + le16_to_cpu(et->et_root_el->l_count), + le16_to_cpu(et->et_root_el->l_next_free_rec)); + + *meta_add += (tree_depth + 2) * leaf_eb_need; + +out: + if (!ret) + *credits += ocfs2_calc_create_meta_credits(sb, *meta_add); + + return ret; +} + static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); @@ -6916,9 +7080,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) { int ret, partial = 0; @@ -6986,20 +7150,16 @@ out: ocfs2_unlock_and_free_pages(pages, numpages); } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) { int numpages, ret = 0; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != - (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - numpages = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; @@ -7027,6 +7187,17 @@ out: return ret; } +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start
>> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + /* * Zero the area past i_size but still within an allocated * cluster. This avoids exposing nonzero data on subsequent file diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 6ea9ecf..0fb954a 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -269,6 +269,11 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); /* * Structures which describe a path through a btree, and functions to * manipulate them. @@ -310,4 +315,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle, int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, handle_t *handle, struct ocfs2_path *path); +int ocfs2_calc_create_meta_credits(struct super_block *sb, + int meta_add); +int ocfs2_calc_tree_change_need(struct super_block *sb, + struct ocfs2_extent_tree *et, int num_recs, + int *meta_add, int *credits); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 2e71729..3ad77f3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -44,6 +44,7 @@ #include "suballoc.h" #include "super.h" #include "symlink.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -1410,18 +1411,29 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, } } +static inline void ocfs2_clear_write_desc(struct ocfs2_write_ctxt *wc) +{ + memset(&wc->w_desc, 0, + sizeof(struct ocfs2_write_cluster_desc) * wc->w_clen); +} /* * Populate each single-cluster write descriptor in the write context * with information about the i/o to be done.
+ * If we encountered a refcounted cluster, break the process and return + * the refcounted start cpos. * * Returns the number of clusters that will have to be allocated, as * well as a worst case estimate of the number of extent records that * would have to be created during a write to an unwritten region. + * + * If we find a refcounted record, return directly with refcounted_cpos + * set as the position. */ static int ocfs2_populate_write_desc(struct inode *inode, struct ocfs2_write_ctxt *wc, unsigned int *clusters_to_alloc, - unsigned int *extents_to_split) + unsigned int *extents_to_split, + unsigned int *refcounted_cpos) { int ret; struct ocfs2_write_cluster_desc *desc; @@ -1432,6 +1444,7 @@ static int ocfs2_populate_write_desc(struct inode *inode, *clusters_to_alloc = 0; *extents_to_split = 0; + *refcounted_cpos = UINT_MAX; for (i = 0; i < wc->w_clen; i++) { desc = &wc->w_desc[i]; @@ -1448,6 +1461,11 @@ static int ocfs2_populate_write_desc(struct inode *inode, goto out; } + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + *refcounted_cpos = desc->c_cpos; + ret = -ETXTBSY; + goto out; + } /* * Assume worst case - that we're writing in * the middle of the extent. 
@@ -1652,6 +1670,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; struct ocfs2_extent_tree et; + unsigned int refcounted_cpos, write_len; ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { @@ -1679,12 +1698,36 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, - &extents_to_split); - if (ret) { + &extents_to_split, &refcounted_cpos); + if (ret && ret != -ETXTBSY) { mlog_errno(ret); goto out; } + if (ret == -ETXTBSY) { + BUG_ON(refcounted_cpos == UINT_MAX); + write_len = wc->w_clen - (refcounted_cpos - wc->w_cpos); + + ret = ocfs2_refcount_cow(inode, di_bh, + refcounted_cpos, write_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* reinitialize write_desc and populate it again. */ + ocfs2_clear_write_desc(wc); + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, + &extents_to_split, + &refcounted_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(refcounted_cpos != UINT_MAX); + } + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; /* diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f32899d..22ec27d 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -30,6 +30,21 @@ #include "sysfile.h" #include "dlmglue.h" #include "extent_map.h" +#include "aops.h" + +struct ocfs2_cow_context { + struct inode *inode; + struct ocfs2_extent_tree di_et; + struct ocfs2_extent_tree ref_et; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct buffer_head **bhs; + struct page **cow_pages; + int num_pages; + u32 cow_start; + u32 cow_len; +}; static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) @@ -770,3 +785,650 @@ out: brelse(ref_bh); return ret; } + +#define MAX_COW_BYTES 1048576 +/* + * Calculate out the start and number of virtual 
clusters we need to to CoW. + * + * cpos is vitual start cluster position we want to do CoW in a + * file and write_len is the cluster length. + * + * Normal we will start CoW from the beginning of extent record cotaining cpos. + * And We will try to Cow as much clusters as we can until we reach + * MAX_COW_BYTES. If the write_len is larger than MAX_COW_BYTES, we will + * use that value as the maximum clusters. + */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, + u32 write_len, + u32 *cow_start, + u32 *cow_len, + int *num_recs, + int *has_data) +{ + int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + int max_clusters = ocfs2_clusters_for_bytes(inode->i_sb, MAX_COW_BYTES); + int leaf_clusters, rec_end = 0; + + max_clusters = max_clusters < write_len ? 
write_len : max_clusters; + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + *num_recs = 0; + *has_data = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec);) { + rec = &el->l_recs[i]; + i++; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + rec_end = le32_to_cpu(rec->e_cpos); + } + + if (!*has_data && !(rec->e_flags & OCFS2_EXT_UNWRITTEN)) + *has_data = 1; + + /* + * If we encounter a hole or a non-refcounted record, + * stop the search. + */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + rec_end != le32_to_cpu(rec->e_cpos)) + break; + + *num_recs += 1; + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + + if (*cow_len + leaf_clusters >= max_clusters) { + if (*cow_len == 0) { + /* + * cpos is in a very large extent record. + * So just split max_clusters from the + * extent record. + */ + leaf_clusters = rec_end - cpos; + + if (leaf_clusters > max_clusters) + *cow_start = cpos; + else + *cow_start = rec_end - max_clusters; + } + *cow_len = max_clusters; + break; + } else + *cow_len += leaf_clusters; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. 
+ */ + if (i == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = 0; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare the data_ac and meta_ac for all the tree modification, + * including both di_tree and refcount_tree. + * + * cow_len is the cluster length we will do cow. + * since we may split the old tree, so we at most will need cow_len + 2 + * extent recs. + */ +static int ocfs2_lock_refcount_cow_allocator(struct super_block *sb, + u32 cow_len, + struct ocfs2_cow_context *context, + int *credits) +{ + int ret = 0, meta_add = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + *credits = 0; + + ret = ocfs2_calc_tree_change_need(sb, &context->di_et, cow_len + 2, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_tree_change_need(sb, &context->ref_et, cow_len + 2, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", + meta_add, cow_len, *credits); + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &context->meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_clusters(osb, cow_len, &context->data_ac); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_duplicate_clusters_in_large_page(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_cow_context *context, + u32 cpos, u32 len, + u64 old_block, u64 new_block) +{ + int ret = 0; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + int i, bpc = ocfs2_clusters_to_blocks(sb, 1); + struct 
ocfs2_super *osb = OCFS2_SB(sb); + int cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + int page_start = context->cow_start / cpp; + int cow_page, cow_len, cp_len; + u64 phys; + struct page *page; + void *kaddr; + unsigned int from, cp_from, to; + + while (len) { + phys = new_block; + cow_page = cpos / cpp; + cow_len = cpp - cpos % cpp; + cp_len = 0; + if (cow_len > len) + cow_len = len; + + page = context->cow_pages[cow_page - page_start]; + cp_from = from = (cpos % cpp) << osb->s_clustersize_bits; + to = from + (cow_len << osb->s_clustersize_bits); + + while (cp_len < cow_len) { + ret = ocfs2_read_blocks(ci, old_block, bpc, + context->bhs, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + kaddr = kmap_atomic(page, KM_USER0); + for (i = 0; i < bpc; i++) + memcpy(kaddr + cp_from + i * sb->s_blocksize, + context->bhs[i]->b_data, + sb->s_blocksize); + kunmap_atomic(kaddr, KM_USER0); + + for (i = 0; i < bpc; i++) { + brelse(context->bhs[i]); + context->bhs[i] = NULL; + } + + cpos++; + cp_len++; + old_block += bpc; + cp_from += osb->s_clustersize; + } + + ocfs2_map_and_dirty_page(context->inode, + handle, from, to, + page, 0, &phys); + + len -= cow_len; + new_block += bpc * cow_len; + } + +out: + return ret; +} + +static int ocfs2_duplicate_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, bh_num; + struct ocfs2_caching_info *ci = context->di_et.et_ci; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + int i, j, bpc = ocfs2_clusters_to_blocks(sb, 1); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 phys, new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + int page_start, ppc = ocfs2_pages_per_cluster(sb); + int bpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + struct page *page; + void *kaddr; + unsigned int from; + + mlog(0, "old_cluster %u, new %u, len %u at offset 
%u\n", old_cluster, + new_cluster, new_len, cpos); + + if (osb->s_clustersize_bits >= PAGE_CACHE_SHIFT) { + /* + * Page size is less than cluster size, so we just need + * to write all the pages in the new clusters. + */ + while (new_len) { + phys = new_block; + ret = ocfs2_read_blocks(ci, old_block, bpc, + context->bhs, 0, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + bh_num = 0; + + page_start = (cpos - context->cow_start) * ppc; + from = cpos << osb->s_clustersize_bits; + + for (i = 0; i < ppc; i++, from += PAGE_CACHE_SIZE) { + page = context->cow_pages[page_start + i]; + + kaddr = kmap_atomic(page, KM_USER0); + for (j = 0; j < bpp; j++, bh_num++) + memcpy(kaddr + j * sb->s_blocksize, + context->bhs[bh_num]->b_data, + sb->s_blocksize); + kunmap_atomic(kaddr, KM_USER0); + + ocfs2_map_and_dirty_page(context->inode, + handle, 0, + PAGE_CACHE_SIZE, + page, 0, &phys); + } + + for (i = 0; i < bpc; i++) { + brelse(context->bhs[i]); + context->bhs[i] = NULL; + } + + new_len--; + cpos++; + old_block += bpc; + new_block += bpc; + } + } else { + ret = ocfs2_duplicate_clusters_in_large_page(handle, ci, + context, + cpos, new_len, + old_block, + new_block); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int ocfs2_clear_ext_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 p_cluster, u32 len, + unsigned int ext_flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, index; + struct ocfs2_extent_rec replace_rec; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 ino = ocfs2_metadata_cache_owner(et->et_ci); + + mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", + (unsigned long long)ino, cpos, len, p_cluster, ext_flags); + + memset(&replace_rec, 0, sizeof(replace_rec)); + replace_rec.e_cpos = cpu_to_le32(cpos); + replace_rec.e_leaf_clusters = cpu_to_le16(len); + 
replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, + p_cluster)); + replace_rec.e_flags = ext_flags; + replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; + + path = ocfs2_new_path_from_et(et); + ret = ocfs2_find_path(et->et_ci, path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + index = ocfs2_search_extent_list(el, cpos); + if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + ocfs2_error(sb, + "Inode %llu has an extent at cpos %u which can no " + "longer be found.\n", + (unsigned long long)ino, cpos); + ret = -EROFS; + goto out; + } + + ret = ocfs2_split_extent(handle, et, path, index, + &replace_rec, meta_ac, dealloc); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_replace_clusters(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old, + u32 new, u32 len, + unsigned int ext_flags) +{ + int ret; + struct ocfs2_caching_info *ci = context->di_et.et_ci; + u64 ino = ocfs2_metadata_cache_owner(ci); + + mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", + (unsigned long long)ino, cpos, old, new, len, ext_flags); + + /*If the old clusters is unwritten, no need to duplicate. 
*/ + if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = ocfs2_duplicate_clusters(handle, context, cpos, + old, new, len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_clear_ext_refcount(handle, &context->di_et, + cpos, new, len, ext_flags, + context->meta_ac, &context->dealloc); +out: + return ret; +} + +static int ocfs2_make_clusters_writable(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 p_cluster, + u32 num_clusters, unsigned int e_flags) +{ + int ret; + u32 new_bit, new_len; + struct ocfs2_caching_info *inode_ci = context->di_et.et_ci; + struct ocfs2_super *osb + OCFS2_SB(ocfs2_metadata_cache_get_super(inode_ci)); + + while (num_clusters) { + ret = __ocfs2_claim_clusters(osb, handle, context->data_ac, + 1, num_clusters, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cpos += new_len; + num_clusters -= new_len; + } + + return ret; +} + +static int ocfs2_replace_cow(struct inode *inode, + struct buffer_head *di_bh, + struct buffer_head *ref_bh, + u32 cow_start, u32 cow_len, + int num_recs, + struct page **pages, + int num_pages) +{ + int ret, credits; + u32 p_cluster, num_clusters, start = cow_start; + unsigned int ext_flags; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cow_context context; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + memset(&context, 0, sizeof(context)); + + context.inode = inode; + context.cow_pages = pages; + context.num_pages = num_pages; + context.cow_start = cow_start; + context.cow_len = cow_len; + + context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1), + sizeof(struct buffer_head *), GFP_NOFS); + 
if (!context.bhs) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + ocfs2_init_dealloc_ctxt(&context.dealloc); + ocfs2_init_dinode_extent_tree(&context.di_et, + INODE_CACHE(inode), di_bh); + ocfs2_init_refcount_extent_tree(&context.ref_et, + INODE_CACHE(inode), ref_bh); + + ret = ocfs2_lock_refcount_cow_allocator(inode->i_sb, cow_len, + &context, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * We also need the credits for removing extents for both di-tree and + * refcount tree and the copy of data. + */ + credits += ocfs2_remove_extent_credits(inode->i_sb) * num_recs * 2; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (cow_len) { + ret = ocfs2_get_clusters(inode, cow_start, &p_cluster, + &num_clusters, &ext_flags); + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(handle, &context, cow_start, + p_cluster, num_clusters, + ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + ret = __ocfs2_decrease_refcount(handle, &context.ref_et, + p_cluster, num_clusters, + context.meta_ac, + &context.dealloc); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. 
+ */ + ocfs2_extent_map_trunc(inode, start); + ocfs2_commit_trans(osb, handle); + +out: + if (context.data_ac) + ocfs2_free_alloc_context(context.data_ac); + if (context.meta_ac) + ocfs2_free_alloc_context(context.meta_ac); + + if (ocfs2_dealloc_has_cluster(&context.dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context.dealloc); + } + + kfree(context.bhs); + return ret; +} + +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len) +{ + int ret, num_recs = 0, has_data = 0, num_pages = 0; + u32 cow_start = 0, cow_len = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *ref_bh = NULL; + struct page **pages = NULL; + loff_t start, end; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di->i_refcount_loc); + + ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len, + &cow_start, &cow_len, + &num_recs, &has_data); + if (ret) { + mlog_errno(ret); + goto out; + } + mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " + "cow_len %u, num_recs %d\n", inode->i_ino, + cpos, write_len, cow_start, cow_len, num_recs); + + BUG_ON(cow_len == 0 || num_recs == 0); + + if (has_data) { + pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb) * cow_len, + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + start = cow_start << OCFS2_SB(inode->i_sb)->s_clustersize_bits; + end = start + + (cow_len << OCFS2_SB(inode->i_sb)->s_clustersize_bits); + ret = ocfs2_grab_pages(inode, start, end, pages, &num_pages); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + le64_to_cpu(di->i_refcount_loc), + &ref_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_replace_cow(inode, di_bh, ref_bh, + cow_start, cow_len, num_recs, + pages, num_pages); + if (ret) + 
mlog_errno(ret); + +out: + if (pages) { + ocfs2_unlock_and_free_pages(pages, num_pages); + kfree(pages); + } + brelse(ref_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9b646c5..1b6e4d6 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -31,4 +31,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, struct buffer_head *di_bh, int *credits, struct ocfs2_alloc_context **meta_ac); +int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 write_len); #endif /* OCFS2_REFCOUNTTREE_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:45 UTC
[Ocfs2-devel] [PATCH 18/42] ocfs2: CoW refcount tree improvement.
During CoW, if the old extent record is refcounted, we allocate some new clusters and do CoW. Actually, we can improve on this. If the old extent has refcount=1, it is now used only by this file, so we don't need to allocate new clusters; we can just remove the refcounted flag. We also have to remove the record from the refcount tree, but without freeing the clusters it describes. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/refcounttree.c | 118 +++++++++++++++++++++++++++++++++++------------ fs/ocfs2/refcounttree.h | 3 +- 3 files changed, 92 insertions(+), 31 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 19f80ef..5a88705 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7056,7 +7056,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, ocfs2_blocks_to_clusters(osb->sb, delete_blk), clusters_to_del, meta_ac, - &tc->tc_dealloc); + &tc->tc_dealloc, 1); else status = ocfs2_truncate_log_append(osb, handle, delete_blk, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 22ec27d..1ae016c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -583,7 +583,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) { int ret = 0, index = 0; struct ocfs2_extent_rec rec; @@ -598,9 +599,10 @@ static int __ocfs2_decrease_refcount(handle_t *handle, goto out; } - mlog(0, "Tree owner %llu, decrease refcount start %u, len %u\n", + mlog(0, "Tree owner %llu, decrease refcount start %u, " + "len %u, delete %u\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos, len); + cpos, len, delete); while (len) { ret = ocfs2_get_refcount_rec(et->et_ci, path, cpos, len, @@ -612,6 +614,8 @@ static int __ocfs2_decrease_refcount(handle_t *handle, r_count = le32_to_cpu(rec.e_refcount); BUG_ON(r_count 
== 0); + if (!delete) + BUG_ON(r_count > 1); r_len = min((u32)(cpos + len), le32_to_cpu(rec.e_cpos) + le16_to_cpu(rec.e_leaf_clusters)) - cpos; @@ -631,12 +635,14 @@ static int __ocfs2_decrease_refcount(handle_t *handle, goto out; } - ret = ocfs2_cache_cluster_dealloc(dealloc, + if (delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, ocfs2_clusters_to_blocks(sb, cpos), - r_len); - if (ret) { - mlog_errno(ret); - goto out; + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } } } else { rec.e_cpos = cpu_to_le32(cpos); @@ -665,7 +671,8 @@ out: int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -686,7 +693,7 @@ int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, ocfs2_init_refcount_extent_tree(&et, INODE_CACHE(inode), ref_bh); ret = __ocfs2_decrease_refcount(handle, &et, - cpos, len, meta_ac, dealloc); + cpos, len, meta_ac, dealloc, delete); if (ret) mlog_errno(ret); out: @@ -1215,33 +1222,95 @@ static int ocfs2_make_clusters_writable(handle_t *handle, u32 cpos, u32 p_cluster, u32 num_clusters, unsigned int e_flags) { - int ret; - u32 new_bit, new_len; + int ret, delete, index; + u32 new_bit, new_len, r_end; + unsigned int set_len; struct ocfs2_caching_info *inode_ci = context->di_et.et_ci; struct ocfs2_super *osb OCFS2_SB(ocfs2_metadata_cache_get_super(inode_ci)); + struct ocfs2_path *path; + struct ocfs2_extent_rec rec; + + mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", + cpos, p_cluster, num_clusters, e_flags); + + path = ocfs2_new_path_from_et(&context->ref_et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } while (num_clusters) { - ret = __ocfs2_claim_clusters(osb, handle, context->data_ac, - 1, num_clusters, - &new_bit, &new_len); + 
ret = ocfs2_get_refcount_rec(context->ref_et.et_ci, + path, p_cluster, + num_clusters, &rec, &index); if (ret) { mlog_errno(ret); break; } - ret = ocfs2_replace_clusters(handle, context, - cpos, p_cluster, new_bit, - new_len, e_flags); + BUG_ON(!rec.e_refcount); + r_end = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + set_len = min(p_cluster + num_clusters, r_end) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and do COW no delete. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. + */ + if (le32_to_cpu(rec.e_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, &context->di_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + break; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(osb, handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + break; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, &context->ref_et, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); if (ret) { mlog_errno(ret); break; } - cpos += new_len; - num_clusters -= new_len; + p_cluster += set_len; + cpos += set_len; + num_clusters -= set_len; + ocfs2_reinit_path(path, 1); } + ocfs2_free_path(path); return ret; } @@ -1326,15 +1395,6 @@ static int ocfs2_replace_cow(struct inode *inode, break; } - ret = __ocfs2_decrease_refcount(handle, &context.ref_et, - p_cluster, num_clusters, - context.meta_ac, - &context.dealloc); - if (ret) { - mlog_errno(ret); - break; - } - cow_len -= num_clusters; cow_start += num_clusters; } diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 1b6e4d6..b01a50e 100644 
--- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -26,7 +26,8 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc); + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete); int ocfs2_prepare_refcount_change_for_del(struct inode *inode, struct buffer_head *di_bh, int *credits, -- 1.6.2.rc2.16.gf474c
The ioctl will call __ocfs2_reflink, which will: 1. Create a new refcount tree for the old file if it doesn't have one, and insert all the extent records into the tree if they are not refcounted. 2. Insert all the extent records into the new inode's extent list. 3. Increase the r_count for all the items in the refcount tree. Note: This patch uses ocfs2_mknod to create the new file directly under the destination directory, which isn't safe. The next few patches will create the file in the orphan dir first, reflink it, and then move it to the destination directory. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/namei.c | 6 +- fs/ocfs2/namei.h | 2 + fs/ocfs2/refcounttree.c | 382 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 386 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2615cb9..7a309bd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -217,10 +217,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) return inode; } -static int ocfs2_mknod(struct inode *dir, - struct dentry *dentry, - int mode, - dev_t dev) +int ocfs2_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t dev) { int status = 0; struct buffer_head *parent_fe_bh = NULL; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 688aef6..7f9cc46 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -35,5 +35,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct inode *orphan_dir_inode, struct inode *inode, struct buffer_head *orphan_dir_bh); +int ocfs2_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t dev); #endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1ae016c..7b8ffca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -31,6 +31,7 @@ #include "dlmglue.h" #include "extent_map.h" #include "aops.h" +#include "namei.h" struct ocfs2_cow_context { struct inode *inode; @@ -1492,3 +1493,384 @@ out: brelse(ref_bh); return ret; } + +/* + * 
Insert a new extent into refcount tree and mark a extent rec + * as refcounted in the dinode tree. + */ +static int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *di_et, + struct ocfs2_extent_tree *ref_et, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + handle_t *handle; + int credits = 1; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_allocators(inode, ref_et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + credits += ocfs2_calc_extend_credits(inode->i_sb, + ref_et->et_root_el, 0); + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_mark_extent_refcounted(inode, di_et, handle, + cpos, num_clusters, p_cluster, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = __ocfs2_increase_refcount(handle, ref_et, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh) +{ + int ret; + struct buffer_head *ref_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + unsigned int ext_flags; + loff_t size; + u32 cpos, num_clusters, clusters, p_cluster; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree di_et; + struct ocfs2_extent_tree ref_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + ret = ocfs2_create_refcount_tree(inode, fe_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + BUG_ON(!di->i_refcount_loc); + ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + 
le64_to_cpu(di->i_refcount_loc), + &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), fe_bh); + ocfs2_init_refcount_extent_tree(&ref_et, INODE_CACHE(inode), ref_bh); + + size = i_size_read(inode); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED) || !p_cluster) + continue; + + ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_et, + cpos - num_clusters, + p_cluster, num_clusters, + &dealloc); + if (ret) { + mlog_errno(ret); + break; + } + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode, 0); + brelse(ref_bh); +out: + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc); + } + return ret; +} + +static int ocfs2_duplicate_extent_list(struct inode *s_inode, + struct inode *t_inode, + handle_t *handle, + struct buffer_head *s_bh, + struct buffer_head *t_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *ref_et) +{ + int ret = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + unsigned int ext_flags; + struct ocfs2_extent_tree et; + struct ocfs2_extent_rec rec; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *)t_bh->b_data; + + memset(&rec, 0, sizeof(rec)); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos = 0; + while (cpos < clusters) { + ret = 
ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster) { + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_int_clusters = cpu_to_le32(num_clusters); + rec.e_blkno = cpu_to_le64( + ocfs2_clusters_to_blocks(s_inode->i_sb, + p_cluster)); + rec.e_flags = ext_flags; + + ret = ocfs2_insert_extent(handle, &et, &rec, meta_ac); + if (ret) { + mlog_errno(ret); + break; + } + + ret = __ocfs2_increase_refcount(handle, ref_et, + p_cluster, num_clusters, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + break; + } + } + + cpos += num_clusters; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + fe->i_size = cpu_to_le64(size); + fe->i_clusters = cpu_to_le32(OCFS2_I(s_inode)->ip_clusters); + + ocfs2_journal_dirty(handle, t_bh); +out: + return ret; +} + +/* + * Calculate how much meta and credits we need for relink s_inode. + * + * We will iterate the s_inode's extent tree and calculate the extent + * record number. We will at most need num_recs * 2 for refcount tree + * in case each one will cause a split. + * the iteration will be very fast since all the extent records have + * already been inserted into extent map by ocfs2_attach_refcount_tree. + */ +static int ocfs2_calc_refcount_credits(struct inode *s_inode, + struct buffer_head *s_bh, + struct ocfs2_extent_tree *ref_et, + int *meta_add, int *credits) +{ + int ret = 0, num_recs = 0; + u32 p_cluster, num_clusters, clusters, cpos; + loff_t size; + + size = i_size_read(s_inode); + clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, + &num_clusters, NULL); + + if (p_cluster) + num_recs++; + + cpos += num_clusters; + } + + num_recs *= 2; + + /* Calculate how many meta we need according to num_recs. 
*/ + ret = ocfs2_calc_tree_change_need(s_inode->i_sb, ref_et, num_recs, + meta_add, credits); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_create_reflink_node(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + int ret, credits, meta_add; + handle_t *handle; + struct buffer_head *ref_bh = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); + struct ocfs2_refcount_block *rb; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_extent_list *el; + struct ocfs2_extent_tree ref_et; + + ocfs2_init_dealloc_ctxt(&dealloc); + + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(di->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(INODE_CACHE(t_inode), + le64_to_cpu(di->i_refcount_loc), + &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ocfs2_init_refcount_extent_tree(&ref_et, INODE_CACHE(t_inode), ref_bh); + rb = (struct ocfs2_refcount_block *)ref_bh->b_data; + + el = &di->id2.i_list; + + meta_add = le16_to_cpu(el->l_next_free_rec) * + le16_to_cpu(el->l_tree_depth); + + credits = ocfs2_calc_create_meta_credits(s_inode->i_sb, meta_add); + + ret = ocfs2_calc_refcount_credits(s_inode, s_bh, &ref_et, + &meta_add, &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "meta_add = %d, credits = %d\n", meta_add, credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, + meta_add, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_free_resource; + } + + ret = ocfs2_duplicate_extent_list(s_inode, t_inode, handle, + s_bh, t_bh, meta_ac, + &dealloc, &ref_et); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); 
+out_free_resource: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + brelse(ref_bh); + + return ret; +} + +static int __ocfs2_reflink(struct dentry *old_dentry, + struct buffer_head *old_bh, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode = old_dentry->d_inode; + struct inode *new_inode; + struct buffer_head *new_bh = NULL; + + mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + ret = ocfs2_mknod(dir, dentry, inode->i_mode, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_attach_refcount_tree(inode, old_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_inode = dentry->d_inode; + + mutex_lock(&new_inode->i_mutex); + ret = ocfs2_inode_lock(new_inode, &new_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh); + if (ret) + mlog_errno(ret); + + ocfs2_inode_unlock(new_inode, 1); + brelse(new_bh); +out_unlock: + mutex_unlock(&new_inode->i_mutex); +out: + mlog_exit(ret); + + return ret; +} -- 1.6.2.rc2.16.gf474c
The ioctl will take 2 parameters: old_path and new_path and works like link. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/ioctl.c | 12 ++++ fs/ocfs2/ocfs2_fs.h | 8 +++ fs/ocfs2/refcounttree.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 3 + 4 files changed, 170 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 9fcd36d..4f189b7 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -22,6 +22,7 @@ #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" +#include "refcounttree.h" #include <linux/ext2_fs.h> @@ -116,6 +117,8 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int status; struct ocfs2_space_resv sr; struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char *old_path, *new_path; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -161,6 +164,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_group_add(inode, &input); + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + old_path = (const char *)(unsigned long)args.old_path; + new_path = (const char *)(unsigned long)args.new_path; + + return ocfs2_reflink(inode, old_path, new_path); default: return -ENOTTY; } @@ -183,6 +194,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + case OCFS2_IOC_REFLINK: break; default: return -ENOIOCTLCMD; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index b3e7bfd..fa550b8 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -297,6 +297,14 @@ struct ocfs2_new_group_input { #define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) #define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) +/* Used to pass 2 file names to reflink. 
*/ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + + /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7b8ffca..ed9e449 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -33,6 +33,11 @@ #include "aops.h" #include "namei.h" +#include <linux/security.h> +#include <linux/quotaops.h> +#include <linux/namei.h> +#include <linux/mount.h> + struct ocfs2_cow_context { struct inode *inode; struct ocfs2_extent_tree di_et; @@ -1874,3 +1879,145 @@ out: return ret; } + +/* copied from may_create in VFS. */ +static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* Most codes are copied from vfs_link. */ +static int ocfs2_vfs_reflink(struct dentry *old_dentry, + struct inode *dir, + struct dentry *new_dentry) +{ + struct buffer_head *old_bh = NULL; + struct inode *inode = old_dentry->d_inode; + int error; + + if (!inode) + return -ENOENT; + + error = ocfs2_may_create(dir, new_dentry); + if (error) + return error; + + if (dir->i_sb != inode->i_sb) + return -EXDEV; + + /* + * A link to an append-only or immutable file cannot be created. 
+ */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (!OCFS2_I(inode)->ip_clusters) { + mlog(ML_ERROR, "reflink doesn't work with 0 cluster files.\n"); + return -EINVAL; + } + + mutex_lock(&inode->i_mutex); + error = ocfs2_inode_lock(inode, &old_bh, 1); + if (error) { + mlog_errno(error); + goto out_unlock; + } + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + DQUOT_INIT(dir); + error = __ocfs2_reflink(old_dentry, old_bh, dir, new_dentry); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 1); + brelse(old_bh); +out_unlock: + mutex_unlock(&inode->i_mutex); + + return error; +} + +/* copied from user_path_parent. */ +static int ocfs2_user_path_parent(const char __user *path, + struct nameidata *nd, char **name) +{ + char *s = getname(path); + int error; + + if (IS_ERR(s)) + return PTR_ERR(s); + + error = path_lookup(s, LOOKUP_PARENT, nd); + if (error) + putname(s); + else + *name = s; + + return error; +} + +/* + * Most codes are copied from sys_linkat. 
+ */ +int ocfs2_reflink(struct inode *inode, + const char __user *oldname, + const char __user *newname) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index b01a50e..790f918 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -34,4 +34,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, struct ocfs2_alloc_context **meta_ac); int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, u32 cpos, u32 write_len); +int ocfs2_reflink(struct inode *inode, + const char __user *oldname, + const char __user *newname); #endif /* OCFS2_REFCOUNTTREE_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 21/42] ocfs2: Use proper parameter for some inode operation.
In order to make the original functions more suitable for reflink, we modify the following inode operations. Both changes are tiny. 1. ocfs2_mknod_locked only uses dentry for mlog and it doesn't use dir and parent_fe_bh, so remove these 3 parameters and move the mlog to its caller. 2. ocfs2_prepare_orphan_dir only wants inode to get its ip_blkno. So use ip_blkno instead. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/namei.c | 42 +++++++++++++++++++----------------------- 1 files changed, 19 insertions(+), 23 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 7a309bd..d0baa05 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -67,18 +67,15 @@ #include "buffer_head_io.h" static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac); static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct buffer_head **de_bh); @@ -342,10 +339,13 @@ int ocfs2_mknod(struct inode *dir, struct dentry *dentry, } did_quota_inode = 1; + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, + inode->i_mode, (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + /* do the real work now.
*/ - status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, - &new_fe_bh, parent_fe_bh, handle, - inode_ac); + status = ocfs2_mknod_locked(osb, inode, dev, + &new_fe_bh, handle, inode_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -447,12 +447,9 @@ leave: } static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, handle_t *handle, struct ocfs2_alloc_context *inode_ac) { @@ -462,10 +459,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, u64 fe_blkno = 0; u16 suballoc_bit; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, - inode->i_mode, (unsigned long)dev, dentry->d_name.len, - dentry->d_name.name); - *new_fe_bh = NULL; status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, @@ -834,7 +827,8 @@ static int ocfs2_unlink(struct inode *dir, } if (inode_is_unlinkable(inode)) { - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_entry_bh); if (status < 0) { @@ -1222,9 +1216,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - new_inode, - orphan_name, - &orphan_entry_bh); + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_entry_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1678,9 +1671,12 @@ static int ocfs2_symlink(struct inode *dir, } did_quota_inode = 1; - status = ocfs2_mknod_locked(osb, dir, inode, dentry, - 0, &new_fe_bh, parent_fe_bh, handle, - inode_ac); + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, + inode->i_mode, dentry->d_name.len, + dentry->d_name.name); + + status = ocfs2_mknod_locked(osb, inode, + 0, &new_fe_bh, handle, inode_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -1828,7 +1824,7 @@ bail: static int 
ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct buffer_head **de_bh) { @@ -1836,7 +1832,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh = NULL; int status = 0; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + status = ocfs2_blkno_stringify(blkno, name); if (status < 0) { mlog_errno(status); return status; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 22/42] ocfs2: Create reflinked file in orphan dir.
reflink is a very complicated process, so it can't be integrated into one transaction. So if the system panics during the operation, we may leave an unfinished inode in the destination directory. This patch tries to create an inode in orphan_dir first, reflink it to the src file and then move it to the destination file in the end. So we are not afraid of any corruption during the reflink. In the meantime, ocfs2_mknod is no longer used outside namei.c, so make it static again. Note: fsck.ocfs2 should work for us to remove the unfinished file in the orphan_dir. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/namei.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/namei.h | 8 +- fs/ocfs2/refcounttree.c | 35 ++++--- 3 files changed, 274 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d0baa05..488d44f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -214,8 +214,8 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) return inode; } -int ocfs2_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev) +static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t dev) { int status = 0; struct buffer_head *parent_fe_bh = NULL; @@ -2017,6 +2017,254 @@ leave: return status; } +int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *fe = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + struct buffer_head *new_fe_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + /* + * We give the orphan dir the root blkno to fake an orphan name, + * and allocate enough space for our insertion.
+ */ + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + osb->root_blkno, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto leave; + } + did_quota_inode = 1; + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, inode, + 0, &new_fe_bh, handle, inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *)new_fe_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, + orphan_entry_bh, orphan_dir); + if (status < 0) + mlog_errno(status); + +leave: + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_fe_bh); + brelse(orphan_entry_bh); + + if (!status) + *new_inode = inode; + + return status; +} + +int 
ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_fe_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dirfe, *fe; + struct buffer_head *de_bh = NULL; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *fe_bh = NULL; + + mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + status = ocfs2_inode_lock(dir, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &fe_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + 
if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + fe = (struct ocfs2_dinode *)fe_bh->b_data; + le32_add_cpu(&fe->i_flags, -OCFS2_ORPHANED_FL); + fe->i_orphaned_slot = 0; + ocfs2_journal_dirty(handle, fe_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(de_bh); + brelse(parent_fe_bh); + brelse(orphan_dir_bh); + + mlog_exit(status); + + return status; +} + const struct inode_operations ocfs2_dir_iops = { .create = ocfs2_create, .lookup = ocfs2_lookup, diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 7f9cc46..40151c6 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -35,7 +35,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct inode *orphan_dir_inode, struct inode *inode, struct buffer_head *orphan_dir_bh); -int ocfs2_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t dev); - +int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, + struct inode **new_inode); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); #endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ed9e449..cc85ab9 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ 
-1833,32 +1833,18 @@ out: static int __ocfs2_reflink(struct dentry *old_dentry, struct buffer_head *old_bh, - struct inode *dir, - struct dentry *dentry) + struct inode *new_inode) { int ret; struct inode *inode = old_dentry->d_inode; - struct inode *new_inode; struct buffer_head *new_bh = NULL; - mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, - old_dentry->d_name.len, old_dentry->d_name.name, - dentry->d_name.len, dentry->d_name.name); - - ret = ocfs2_mknod(dir, dentry, inode->i_mode, 0); - if (ret) { - mlog_errno(ret); - goto out; - } - ret = ocfs2_attach_refcount_tree(inode, old_bh); if (ret) { mlog_errno(ret); goto out; } - new_inode = dentry->d_inode; - mutex_lock(&new_inode->i_mutex); ret = ocfs2_inode_lock(new_inode, &new_bh, 1); if (ret) { @@ -1897,6 +1883,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, { struct buffer_head *old_bh = NULL; struct inode *inode = old_dentry->d_inode; + struct inode *new_orphan_inode; int error; if (!inode) @@ -1922,6 +1909,13 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, return -EINVAL; } + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + &new_orphan_inode); + if (error) { + mlog_errno(error); + goto out_unlock; + } + mutex_lock(&inode->i_mutex); error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { @@ -1931,14 +1925,23 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, down_write(&OCFS2_I(inode)->ip_alloc_sem); DQUOT_INIT(dir); - error = __ocfs2_reflink(old_dentry, old_bh, dir, new_dentry); + error = __ocfs2_reflink(old_dentry, old_bh, new_orphan_inode); up_write(&OCFS2_I(inode)->ip_alloc_sem); + if (error) + mlog_errno(error); ocfs2_inode_unlock(inode, 1); brelse(old_bh); out_unlock: mutex_unlock(&inode->i_mutex); + if (!error) { + error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + new_dentry); + if (error) + mlog_errno(error); + } + return error; } -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 23/42] ocfs2: Abstract caching info checkpoint.
In meta downconvert, we need to checkpoint the metadata in an inode. For refcount tree, we also need it. So abstract the process out. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/dlmglue.c | 18 +++++++++++++----- 1 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index fe76476..68cb674 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3342,11 +3342,11 @@ out: return UNBLOCK_CONTINUE; } -static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, - int new_level) +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) { - struct inode *inode = ocfs2_lock_res_inode(lockres); - int checkpointed = ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)); + int checkpointed = ocfs2_ci_fully_checkpointed(ci); BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); @@ -3354,10 +3354,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, if (checkpointed) return 1; - ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); return 0; } +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) { struct inode *inode = ocfs2_lock_res_inode(lockres); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 24/42] ocfs2: Add new refcount tree lock resource.
refcount tree lock resource is used to protect refcount tree read/write among multiple nodes. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/dlmglue.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlmglue.h | 4 +++ fs/ocfs2/ocfs2_lockid.h | 5 +++ fs/ocfs2/refcounttree.h | 14 +++++++++ 4 files changed, 91 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 68cb674..cc0a65e 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -53,6 +53,7 @@ #include "super.h" #include "uptodate.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -107,6 +108,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) /* This aids in debugging situations where a bad LVB might be involved. 
*/ @@ -267,6 +273,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, }; +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -648,6 +660,16 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, info); } +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -3495,6 +3517,24 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = OCFS2_REF_ITEM(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_refcount_tree *tree = OCFS2_REF_ITEM(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) { struct ocfs2_qinfo_lvb *lvb; @@ -3606,6 +3646,34 @@ bail: return status; } +int ocfs2_refcount_lock(struct ocfs2_lock_res *lockres, int ex) +{ + int status; + int level = ex ? 
DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_super *osb = lockres->l_priv; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_lock_res *lockres, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + /* * This is the filesystem locking protocol. It provides the lock handling * hooks for the underlying DLM. It has a maximum version number. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 3f8d998..20fe528 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -85,6 +85,8 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo; void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); @@ -121,6 +123,8 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock); void ocfs2_file_unlock(struct file *file); int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_refcount_lock(struct ocfs2_lock_res *lockres, int ex); +void ocfs2_refcount_unlock(struct ocfs2_lock_res *lockres, int ex); void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index eb6f50c..aafed05 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -47,6 +47,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, + 
OCFS2_LOCK_TYPE_REFCOUNT, OCFS2_NUM_LOCK_TYPES }; @@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_QINFO: c = 'Q'; break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; default: c = '\0'; } @@ -100,6 +104,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 790f918..2c59c65 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -17,6 +17,20 @@ #ifndef OCFS2_REFCOUNTTREE_H #define OCFS2_REFCOUNTTREE_H +struct ocfs2_refcount_tree { + struct rb_node rf_node; + u64 rf_blkno; + struct rw_semaphore rf_sem; + struct ocfs2_lock_res rf_lockres; + struct ocfs2_caching_info rf_ci; +}; + +static inline struct ocfs2_refcount_tree * +OCFS2_REF_ITEM(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh); int ocfs2_set_refcount_tree(struct inode *inode, struct buffer_head *di_bh, -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 25/42] ocfs2: Add refcount tree lock mechanism.
Refcount tree lock is only related to a block number. So create a rb-tree for it. And the tree root is stored in ocfs2_super. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/ocfs2.h | 3 + fs/ocfs2/refcounttree.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 7 ++ fs/ocfs2/super.c | 5 ++ 4 files changed, 200 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 74db43f..366c91c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -372,6 +372,9 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + + /* rb tree root for refcount lock. */ + struct rb_root osb_rf_lock_tree; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index cc85ab9..f0ed496 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2024,3 +2024,188 @@ out: return error; } + +static struct ocfs2_refcount_tree * +ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) +{ + struct rb_node *n = osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tree = NULL; + + while (n) { + tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); + + if (blkno < tree->rf_blkno) + n = n->rb_left; + else if (blkno > tree->rf_blkno) + n = n->rb_right; + else + return tree; + } + + return NULL; +} + +/* osb_lock is already locked. */ +static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new) +{ + u64 rf_blkno = new->rf_blkno; + struct rb_node *parent = NULL; + struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tmp; + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_refcount_tree, + rf_node); + + if (rf_blkno < tmp->rf_blkno) + p = &(*p)->rb_left; + else if (rf_blkno > tmp->rf_blkno) + p = &(*p)->rb_right; + else { + /* This should never happen! 
*/ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + ocfs2_simple_drop_lockres(osb, &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + spin_unlock(&osb->osb_lock); + + ocfs2_free_refcount_tree(osb, tree); +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) { + ret = -ENOMEM; + return ret; + } + + new->rf_blkno = rf_blkno; + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, rf_blkno); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + if (new) + ocfs2_free_refcount_tree(osb, new); + + *ret_tree = tree; + + spin_unlock(&osb->osb_lock); + + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really need it. 
+ */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + struct ocfs2_refcount_tree *tree = NULL; + + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_lock(&tree->rf_lockres, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + + if (ref_bh) { + ret = ocfs2_read_refcount_block(&tree->rf_ci, + ref_blkno, ref_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + goto out; + } + } + + *ret_tree = tree; +out: + return ret; +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + ocfs2_refcount_unlock(&tree->rf_lockres, rw); + + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); +} + +void ocfs2_purge_refcount_tree(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + mlog(0, "Purge tree %llu\n", + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(osb, tree); + } +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 2c59c65..d108f4d 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -51,4 +51,11 @@ int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, int ocfs2_reflink(struct inode *inode, const char __user *oldname, const char __user *newname); +void ocfs2_purge_refcount_tree(struct ocfs2_super *osb); +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **tree, + struct buffer_head **ref_bh); +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct 
ocfs2_refcount_tree *tree, + int rw); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6497559..7dc7128 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,6 +68,7 @@ #include "ver.h" #include "xattr.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -1625,6 +1626,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_sync_blockdev(sb); + ocfs2_purge_refcount_tree(osb); + /* No cluster connection means we've failed during mount, so skip * all the steps which depended on that to complete. */ if (osb->cconn) { @@ -1817,6 +1820,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } + osb->osb_rf_lock_tree = RB_ROOT; + osb->s_feature_compat le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 26/42] ocfs2: lock refcount tree if needed.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 17 ++++++++++- fs/ocfs2/refcounttree.c | 71 +++++++++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 28 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5a88705..a09bfb5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7496,6 +7496,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct ocfs2_path *path = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; mlog_entry_void(); @@ -7507,11 +7508,21 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, if (!path) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto free; } ocfs2_extent_map_trunc(inode, new_highest_cpos); + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + status = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto free; + } + } + start: /* * Check that we still have allocation to delete. @@ -7636,6 +7647,9 @@ start: goto start; bail: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (!status && OCFS2_I(inode)->ip_clusters == 0) { /* remove the refcount tree. 
*/ status = ocfs2_remove_refcount_tree(inode, fe_bh); @@ -7654,6 +7668,7 @@ bail: ocfs2_run_deallocs(osb, &tc->tc_dealloc); +free: ocfs2_free_path(path); /* This will drop the ext_alloc cluster lock for us */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f0ed496..9b460db 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -52,6 +52,9 @@ struct ocfs2_cow_context { u32 cow_len; }; +static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree); + static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) { @@ -228,15 +231,16 @@ int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *ref_bh = NULL; struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); BUG_ON(di->i_refcount_loc); - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), - refcount_loc, &ref_bh); + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_bh); if (ret) { mlog_errno(ret); - goto out; + return ret; } handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); @@ -271,9 +275,11 @@ int ocfs2_set_refcount_tree(struct inode *inode, di->i_refcount_loc = cpu_to_le64(refcount_loc); spin_unlock(&oi->ip_lock); ocfs2_journal_dirty(handle, di_bh); + out_commit: ocfs2_commit_trans(osb, handle); out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_bh); return ret; @@ -281,7 +287,7 @@ out: int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) { - int ret; + int ret, delete_tree = 0; handle_t *handle = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -290,19 +296,19 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) struct inode *alloc_inode = NULL; struct buffer_head *alloc_bh = NULL; struct buffer_head *blk_bh = 
NULL; + struct ocfs2_refcount_tree *ref_tree; int credits = OCFS2_INODE_UPDATE_CREDITS + 1; - u64 blk = 0, bg_blkno = 0; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) return 0; - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), - le64_to_cpu(di->i_refcount_loc), - &blk_bh); + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); if (ret) { mlog_errno(ret); - goto out; + return ret; } rb = (struct ocfs2_refcount_block *)blk_bh->b_data; @@ -367,6 +373,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) ocfs2_journal_dirty(handle, blk_bh); if (!rb->rf_count) { + delete_tree = 1; ret = ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, bit, bg_blkno, 1); if (ret) @@ -386,6 +393,9 @@ out_mutex: iput(alloc_inode); } out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_delete_refcount_tree(osb, ref_tree); brelse(blk_bh); return ret; @@ -1436,9 +1446,11 @@ int ocfs2_refcount_cow(struct inode *inode, int ret, num_recs = 0, has_data = 0, num_pages = 0; u32 cow_start = 0, cow_len = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_bh = NULL; struct page **pages = NULL; + struct ocfs2_refcount_tree *ref_tree; loff_t start, end; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); @@ -1449,7 +1461,7 @@ int ocfs2_refcount_cow(struct inode *inode, &num_recs, &has_data); if (ret) { mlog_errno(ret); - goto out; + return ret; } mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " "cow_len %u, num_recs %d\n", inode->i_ino, @@ -1476,10 +1488,9 @@ int ocfs2_refcount_cow(struct inode *inode, } } - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), - le64_to_cpu(di->i_refcount_loc), - &ref_bh); - if (ret < 0) { + ret = 
ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_bh); + if (ret) { mlog_errno(ret); goto out; } @@ -1490,12 +1501,13 @@ int ocfs2_refcount_cow(struct inode *inode, if (ret) mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_bh); out: if (pages) { ocfs2_unlock_and_free_pages(pages, num_pages); kfree(pages); } - brelse(ref_bh); return ret; } @@ -1560,6 +1572,8 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, struct buffer_head *ref_bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_tree *ref_tree; unsigned int ext_flags; loff_t size; u32 cpos, num_clusters, clusters, p_cluster; @@ -1578,9 +1592,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, } BUG_ON(!di->i_refcount_loc); - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), - le64_to_cpu(di->i_refcount_loc), - &ref_bh); + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), 1, + &ref_tree, &ref_bh); if (ret) { mlog_errno(ret); goto out; @@ -1616,13 +1630,15 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, * record from the disk. 
*/ ocfs2_extent_map_trunc(inode, 0); + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_bh); -out: if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { - ocfs2_schedule_truncate_log_flush(OCFS2_SB(inode->i_sb), 1); - ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); } +out: return ret; } @@ -1756,6 +1772,7 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_refcount_tree *ref_tree; struct ocfs2_extent_list *el; struct ocfs2_extent_tree ref_et; @@ -1768,9 +1785,8 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, goto out; } - ret = ocfs2_read_refcount_block(INODE_CACHE(t_inode), - le64_to_cpu(di->i_refcount_loc), - &ref_bh); + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_bh); if (ret) { mlog_errno(ret); goto out; @@ -1799,7 +1815,7 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, meta_add, &meta_ac); if (ret) { mlog_errno(ret); - goto out; + goto out_unlock_refcount; } } @@ -1820,14 +1836,15 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, out_free_resource: if (meta_ac) ocfs2_free_alloc_context(meta_ac); +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_bh); out: if (ocfs2_dealloc_has_cluster(&dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); } - brelse(ref_bh); - return ret; } -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 27/42] ocfs2: Add caching info for refcount tree.
refcount tree should use its own caching info so that when we downconvert the refcount tree lock, we can drop all the cached buffer head. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 114 ++++++++++++++++++++++++++++++++++++++++++----- fs/ocfs2/refcounttree.h | 12 +++++ 2 files changed, 114 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9b460db..8efb0d2 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -52,8 +52,11 @@ struct ocfs2_cow_context { u32 cow_len; }; +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops; static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree); +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree); static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) @@ -140,6 +143,7 @@ int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *tree; u16 slot, suballoc_bit_start; u32 num_got; u64 first_blkno; @@ -177,10 +181,16 @@ int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh) goto out_commit; } + ret = ocfs2_get_refcount_tree(osb, first_blkno, &tree); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + new_bh = sb_getblk(inode->i_sb, first_blkno); - ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + ocfs2_set_new_buffer_uptodate(&tree->rf_ci, new_bh); - ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), new_bh, + ret = ocfs2_journal_access_rb(handle, &tree->rf_ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -257,7 +267,7 @@ int ocfs2_set_refcount_tree(struct inode *inode, goto out_commit; } - ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), ref_bh, 
+ ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -276,6 +286,7 @@ int ocfs2_set_refcount_tree(struct inode *inode, spin_unlock(&oi->ip_lock); ocfs2_journal_dirty(handle, di_bh); + ocfs2_set_ci_lock_trans(osb->journal, &ref_tree->rf_ci); out_commit: ocfs2_commit_trans(osb, handle); out: @@ -348,6 +359,8 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) goto out_unlock; } + ocfs2_set_ci_lock_trans(osb->journal, &ref_tree->rf_ci); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -355,7 +368,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) goto out_commit; } - ret = ocfs2_journal_access_rb(handle, INODE_CACHE(inode), blk_bh, + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -591,6 +604,9 @@ static int __ocfs2_increase_refcount(handle_t *handle, } out: + ocfs2_set_ci_lock_trans( + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci))->journal, + et->et_ci); ocfs2_free_path(path); return ret; } @@ -680,6 +696,7 @@ static int __ocfs2_decrease_refcount(handle_t *handle, } out: + ocfs2_set_ci_lock_trans(OCFS2_SB(sb)->journal, et->et_ci); ocfs2_free_path(path); return ret; } @@ -695,11 +712,19 @@ int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, struct buffer_head *ref_bh = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_tree et; + struct ocfs2_refcount_tree *tree; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); BUG_ON(!di->i_refcount_loc); - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, le64_to_cpu(di->i_refcount_loc), 
&ref_bh); if (ret) { @@ -707,7 +732,7 @@ int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, goto out; } - ocfs2_init_refcount_extent_tree(&et, INODE_CACHE(inode), ref_bh); + ocfs2_init_refcount_extent_tree(&et, &tree->rf_ci, ref_bh); ret = __ocfs2_decrease_refcount(handle, &et, cpos, len, meta_ac, dealloc, delete); if (ret) @@ -773,6 +798,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, struct buffer_head *ref_bh = NULL; struct ocfs2_refcount_block *rb; struct ocfs2_extent_tree et; + struct ocfs2_refcount_tree *tree; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " @@ -785,7 +811,14 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); BUG_ON(!di->i_refcount_loc); - ret = ocfs2_read_refcount_block(INODE_CACHE(inode), + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, le64_to_cpu(di->i_refcount_loc), &ref_bh); if (ret < 0) { @@ -794,7 +827,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, } rb = (struct ocfs2_refcount_block *)ref_bh->b_data; - ocfs2_init_refcount_extent_tree(&et, INODE_CACHE(inode), ref_bh); + ocfs2_init_refcount_extent_tree(&et, &tree->rf_ci, ref_bh); ret = ocfs2_lock_allocators(inode, &et, 0, extents_split, NULL, meta_ac); @@ -1333,6 +1366,7 @@ static int ocfs2_make_clusters_writable(handle_t *handle, static int ocfs2_replace_cow(struct inode *inode, struct buffer_head *di_bh, struct buffer_head *ref_bh, + struct ocfs2_caching_info *ref_ci, u32 cow_start, u32 cow_len, int num_recs, struct page **pages, @@ -1371,8 +1405,7 @@ static int ocfs2_replace_cow(struct inode *inode, ocfs2_init_dealloc_ctxt(&context.dealloc); ocfs2_init_dinode_extent_tree(&context.di_et, INODE_CACHE(inode), di_bh); - 
ocfs2_init_refcount_extent_tree(&context.ref_et, - INODE_CACHE(inode), ref_bh); + ocfs2_init_refcount_extent_tree(&context.ref_et, ref_ci, ref_bh); ret = ocfs2_lock_refcount_cow_allocator(inode->i_sb, cow_len, &context, &credits); @@ -1495,7 +1528,7 @@ int ocfs2_refcount_cow(struct inode *inode, goto out; } - ret = ocfs2_replace_cow(inode, di_bh, ref_bh, + ret = ocfs2_replace_cow(inode, di_bh, ref_bh, &ref_tree->rf_ci, cow_start, cow_len, num_recs, pages, num_pages); if (ret) @@ -1791,7 +1824,7 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - ocfs2_init_refcount_extent_tree(&ref_et, INODE_CACHE(t_inode), ref_bh); + ocfs2_init_refcount_extent_tree(&ref_et, &ref_tree->rf_ci, ref_bh); rb = (struct ocfs2_refcount_block *)ref_bh->b_data; el = &di->id2.i_list; @@ -2096,6 +2129,7 @@ static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, static void ocfs2_free_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree) { + ocfs2_metadata_cache_exit(&tree->rf_ci); ocfs2_simple_drop_lockres(osb, &tree->rf_lockres); ocfs2_lock_res_free(&tree->rf_lockres); kfree(tree); @@ -2131,8 +2165,12 @@ static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, } new->rf_blkno = rf_blkno; + new->rf_sb = osb->sb; + spin_lock_init(&new->rf_lock); + mutex_init(&new->rf_io_mutex); init_rwsem(&new->rf_sem); ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, rf_blkno); + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); spin_lock(&osb->osb_lock); tree = ocfs2_find_refcount_tree(osb, rf_blkno); @@ -2226,3 +2264,55 @@ void ocfs2_purge_refcount_tree(struct ocfs2_super *osb) ocfs2_free_refcount_tree(osb, tree); } } + +static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_blkno; +} + +static struct super_block * +ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct 
ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_sb; +} + +static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_lock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_unlock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_lock(&rf->rf_io_mutex); +} + +static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_unlock(&rf->rf_io_mutex); +} + +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { + .co_owner = ocfs2_refcount_cache_owner, + .co_get_super = ocfs2_refcount_cache_get_super, + .co_cache_lock = ocfs2_refcount_cache_lock, + .co_cache_unlock = ocfs2_refcount_cache_unlock, + .co_io_lock = ocfs2_refcount_cache_io_lock, + .co_io_unlock = ocfs2_refcount_cache_io_unlock, +}; diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index d108f4d..2930e29 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -22,7 +22,13 @@ struct ocfs2_refcount_tree { u64 rf_blkno; struct rw_semaphore rf_sem; struct ocfs2_lock_res rf_lockres; + + /* the following 4 fields are used by caching_info. 
*/ struct ocfs2_caching_info rf_ci; + spinlock_t rf_lock; + struct mutex rf_io_mutex; + struct super_block *rf_sb; + }; static inline struct ocfs2_refcount_tree * @@ -31,6 +37,12 @@ OCFS2_REF_ITEM(struct ocfs2_lock_res *res) return container_of(res, struct ocfs2_refcount_tree, rf_lockres); } +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + int ocfs2_create_refcount_tree(struct inode *inode, struct buffer_head *di_bh); int ocfs2_set_refcount_tree(struct inode *inode, struct buffer_head *di_bh, -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 28/42] ocfs2: Add refcount tree find mechanism from an inode.
xattr needs refcount tree support, but some functions don't have easy access to the inode bh, so add a helper function which reads inode_bh first and then gets the refcount tree. Also, in order to speed up refcount tree lookups, we add an LRU to the ocfs2_super so that we can find a recently used refcount tree more quickly. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/refcounttree.c | 85 ++++++++++++++++++++++++++++++++++++++++------- fs/ocfs2/refcounttree.h | 1 - 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 366c91c..7fb60fe 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -375,6 +375,7 @@ struct ocfs2_super /* rb tree root for refcount lock. */ struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8efb0d2..ce07e10 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2152,7 +2152,11 @@ static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, struct ocfs2_refcount_tree *tree, *new = NULL; spin_lock(&osb->osb_lock); - tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); if (tree) goto out; @@ -2188,29 +2192,48 @@ out: *ret_tree = tree; + osb->osb_ref_tree_lru = tree; spin_unlock(&osb->osb_lock); return ret; } -/* - * Lock the refcount tree pointed by ref_blkno and return the tree. - * In most case, we lock the tree and read the refcount block. - * So read it here if the caller really need it. - */ -int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) +/* Search refcount tree by inode. 
*/ +static int ocfs2_get_refcount_tree_by_inode(struct inode *inode, + struct ocfs2_refcount_tree **ret_tree) { int ret; - struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + /* We have never touch refcount tree for this inode. So let us do it. */ + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; } + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + ret = ocfs2_get_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + ret_tree); + if (ret) + mlog_errno(ret); + + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + ret = ocfs2_refcount_lock(&tree->rf_lockres, rw); if (ret) { mlog_errno(ret); @@ -2224,7 +2247,7 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, if (ref_bh) { ret = ocfs2_read_refcount_block(&tree->rf_ci, - ref_blkno, ref_bh); + tree->rf_blkno, ref_bh); if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); @@ -2237,6 +2260,44 @@ out: return ret; } +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really need it. 
+ */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + struct ocfs2_refcount_tree *tree = NULL; + + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + return __ocfs2_lock_refcount_tree(osb, tree, rw, ret_tree, ref_bh); +} + +int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + struct ocfs2_refcount_tree *tree = NULL; + + ret = ocfs2_get_refcount_tree_by_inode(inode, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + return __ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), tree, + rw, ret_tree, ref_bh); +} + void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 2930e29..5c3faaf 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -28,7 +28,6 @@ struct ocfs2_refcount_tree { spinlock_t rf_lock; struct mutex rf_io_mutex; struct super_block *rf_sb; - }; static inline struct ocfs2_refcount_tree * -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 29/42] ocfs2: Return extent flags for xattr value tree.
With the new refcount tree, xattr values can also be refcounted among multiple files. So return the appropriate extent flags so that CoW can use them later. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/extent_map.c | 6 +++++- fs/ocfs2/extent_map.h | 3 ++- fs/ocfs2/xattr.c | 7 ++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 40b5105..843db64 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -541,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el) + struct ocfs2_extent_list *el, + unsigned int *extent_flags) { int ret = 0, i; struct buffer_head *eb_bh = NULL; @@ -593,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *p_cluster = *p_cluster + coff; if (num_clusters) *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; } out: if (eb_bh) diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 9942f47..e79d41c 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -55,7 +55,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el); + struct ocfs2_extent_list *el, + unsigned int *extent_flags); int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 291ca13..70ad01c 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -699,7 +699,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &vb->vb_xv->xr_list); + &vb->vb_xv->xr_list, NULL); if (ret) { mlog_errno(ret); goto 
out; @@ -954,7 +954,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, cpos = 0; while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, el); + &num_clusters, el, NULL); if (ret) { mlog_errno(ret); goto out; @@ -1192,7 +1192,8 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, &xv->xr_list); + &num_clusters, &xv->xr_list, + NULL); if (ret) { mlog_errno(ret); goto out; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 30/42] ocfs2: Abstract duplicate clusters process in CoW.
We currently use the page cache to duplicate clusters in CoW, but that isn't suitable for the xattr case. So abstract it out so that the caller can decide which method to use. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 156 ++++++++++++++++++++++++++--------------------- 1 files changed, 87 insertions(+), 69 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ce07e10..bf03d21 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -40,7 +40,7 @@ struct ocfs2_cow_context { struct inode *inode; - struct ocfs2_extent_tree di_et; + struct ocfs2_extent_tree data_et; struct ocfs2_extent_tree ref_et; struct ocfs2_alloc_context *meta_ac; struct ocfs2_alloc_context *data_ac; @@ -50,6 +50,14 @@ struct ocfs2_cow_context { int num_pages; u32 cow_start; u32 cow_len; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); }; static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops; @@ -855,7 +863,7 @@ out: * use that value as the maximum clusters. 
*/ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_list *el, u32 cpos, u32 write_len, u32 *cow_start, @@ -864,8 +872,6 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, int *has_data) { int ret = 0; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; int tree_height = le16_to_cpu(el->l_tree_depth), i; struct buffer_head *eb_bh = NULL; struct ocfs2_extent_block *eb = NULL; @@ -998,7 +1004,7 @@ static int ocfs2_lock_refcount_cow_allocator(struct super_block *sb, *credits = 0; - ret = ocfs2_calc_tree_change_need(sb, &context->di_et, cow_len + 2, + ret = ocfs2_calc_tree_change_need(sb, &context->data_et, cow_len + 2, &meta_add, credits); if (ret) { mlog_errno(ret); @@ -1104,13 +1110,13 @@ out: return ret; } -static int ocfs2_duplicate_clusters(handle_t *handle, - struct ocfs2_cow_context *context, - u32 cpos, u32 old_cluster, - u32 new_cluster, u32 new_len) +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) { int ret = 0, bh_num; - struct ocfs2_caching_info *ci = context->di_et.et_ci; + struct ocfs2_caching_info *ci = context->data_et.et_ci; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); int i, j, bpc = ocfs2_clusters_to_blocks(sb, 1); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); @@ -1243,7 +1249,7 @@ static int ocfs2_replace_clusters(handle_t *handle, unsigned int ext_flags) { int ret; - struct ocfs2_caching_info *ci = context->di_et.et_ci; + struct ocfs2_caching_info *ci = context->data_et.et_ci; u64 ino = ocfs2_metadata_cache_owner(ci); mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", @@ -1251,15 +1257,15 @@ static int ocfs2_replace_clusters(handle_t *handle, /*If the old clusters is unwritten, no need to duplicate. 
*/ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { - ret = ocfs2_duplicate_clusters(handle, context, cpos, - old, new, len); + ret = context->cow_duplicate_clusters(handle, context, cpos, + old, new, len); if (ret) { mlog_errno(ret); goto out; } } - ret = ocfs2_clear_ext_refcount(handle, &context->di_et, + ret = ocfs2_clear_ext_refcount(handle, &context->data_et, cpos, new, len, ext_flags, context->meta_ac, &context->dealloc); out: @@ -1274,7 +1280,7 @@ static int ocfs2_make_clusters_writable(handle_t *handle, int ret, delete, index; u32 new_bit, new_len, r_end; unsigned int set_len; - struct ocfs2_caching_info *inode_ci = context->di_et.et_ci; + struct ocfs2_caching_info *inode_ci = context->data_et.et_ci; struct ocfs2_super *osb OCFS2_SB(ocfs2_metadata_cache_get_super(inode_ci)); struct ocfs2_path *path; @@ -1313,7 +1319,8 @@ static int ocfs2_make_clusters_writable(handle_t *handle, */ if (le32_to_cpu(rec.e_refcount) == 1) { delete = 0; - ret = ocfs2_clear_ext_refcount(handle, &context->di_et, + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, cpos, p_cluster, set_len, e_flags, context->meta_ac, @@ -1363,21 +1370,27 @@ static int ocfs2_make_clusters_writable(handle_t *handle, return ret; } -static int ocfs2_replace_cow(struct inode *inode, - struct buffer_head *di_bh, - struct buffer_head *ref_bh, - struct ocfs2_caching_info *ref_ci, - u32 cow_start, u32 cow_len, - int num_recs, - struct page **pages, - int num_pages) +static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + + return ocfs2_get_clusters(inode, v_cluster, p_cluster, + num_clusters, extent_flags); +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context, + int num_recs) { int ret, credits; - u32 p_cluster, num_clusters, start = cow_start; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + 
u32 p_cluster, num_clusters; unsigned int ext_flags; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_cow_context context; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " @@ -1386,36 +1399,17 @@ static int ocfs2_replace_cow(struct inode *inode, return -EROFS; } - memset(&context, 0, sizeof(context)); - - context.inode = inode; - context.cow_pages = pages; - context.num_pages = num_pages; - context.cow_start = cow_start; - context.cow_len = cow_len; - - context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1), - sizeof(struct buffer_head *), GFP_NOFS); - if (!context.bhs) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - - ocfs2_init_dealloc_ctxt(&context.dealloc); - ocfs2_init_dinode_extent_tree(&context.di_et, - INODE_CACHE(inode), di_bh); - ocfs2_init_refcount_extent_tree(&context.ref_et, ref_ci, ref_bh); + ocfs2_init_dealloc_ctxt(&context->dealloc); ret = ocfs2_lock_refcount_cow_allocator(inode->i_sb, cow_len, - &context, &credits); + context, &credits); if (ret) { mlog_errno(ret); return ret; } /* - * We also need the credits for removing extents for both di-tree and + * We also need the credits for removing extents for both data-tree and * refcount tree and the copy of data. 
*/ credits += ocfs2_remove_extent_credits(inode->i_sb) * num_recs * 2; @@ -1428,15 +1422,15 @@ static int ocfs2_replace_cow(struct inode *inode, } while (cow_len) { - ret = ocfs2_get_clusters(inode, cow_start, &p_cluster, - &num_clusters, &ext_flags); + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); if (cow_len < num_clusters) num_clusters = cow_len; - ret = ocfs2_make_clusters_writable(handle, &context, cow_start, + ret = ocfs2_make_clusters_writable(handle, context, cow_start, p_cluster, num_clusters, ext_flags); if (ret) { @@ -1448,27 +1442,19 @@ static int ocfs2_replace_cow(struct inode *inode, cow_start += num_clusters; } - - /* - * truncate the extent map here since no matter whether we meet with - * any error during the action, we shouldn't trust cached extent map - * any more. - */ - ocfs2_extent_map_trunc(inode, start); ocfs2_commit_trans(osb, handle); out: - if (context.data_ac) - ocfs2_free_alloc_context(context.data_ac); - if (context.meta_ac) - ocfs2_free_alloc_context(context.meta_ac); + if (context->data_ac) + ocfs2_free_alloc_context(context->data_ac); + if (context->meta_ac) + ocfs2_free_alloc_context(context->meta_ac); - if (ocfs2_dealloc_has_cluster(&context.dealloc)) { + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); - ocfs2_run_deallocs(osb, &context.dealloc); + ocfs2_run_deallocs(osb, &context->dealloc); } - kfree(context.bhs); return ret; } @@ -1485,11 +1471,15 @@ int ocfs2_refcount_cow(struct inode *inode, struct page **pages = NULL; struct ocfs2_refcount_tree *ref_tree; loff_t start, end; + struct ocfs2_cow_context context; + + memset(&context, 0, sizeof(context)); BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); BUG_ON(!di->i_refcount_loc); - ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len, + ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, + cpos, write_len, 
&cow_start, &cow_len, &num_recs, &has_data); if (ret) { @@ -1528,12 +1518,40 @@ int ocfs2_refcount_cow(struct inode *inode, goto out; } - ret = ocfs2_replace_cow(inode, di_bh, ref_bh, &ref_tree->rf_ci, - cow_start, cow_len, num_recs, - pages, num_pages); + context.inode = inode; + context.cow_start = cow_start; + context.cow_len = cow_len; + context.cow_pages = pages; + context.num_pages = num_pages; + context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1), + sizeof(struct buffer_head *), GFP_NOFS); + if (!context.bhs) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_unlock; + } + + context.cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; + context.get_clusters = ocfs2_di_get_clusters; + + ocfs2_init_dinode_extent_tree(&context.data_et, + INODE_CACHE(inode), di_bh); + ocfs2_init_refcount_extent_tree(&context.ref_et, + &ref_tree->rf_ci, ref_bh); + + ret = ocfs2_replace_cow(&context, num_recs); if (ret) mlog_errno(ret); + /* + * truncate the extent map here since no matter whether we meet with + * any error during the action, we shouldn't trust cached extent map + * any more. + */ + ocfs2_extent_map_trunc(inode, cow_start); + + kfree(context.bhs); +out_unlock: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_bh); out: -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 31/42] ocfs2: Use xs->bucket to set xattr value outside.
A long time ago, xs->base was allocated with a 4K size and all the contents of the bucket were copied into it. Now we use ocfs2_xattr_bucket to abstract the xattr bucket, and xs->base is initialized to the start of bu_bhs[0]. So xs->base + offset will run past the end of that buffer when the value root is stored outside the first block. Then why have we survived the xattr tests until now? It is because we always read the bucket contiguously and the kernel mm happens to allocate contiguous memory for us. We are lucky, but we should fix it. So just get the right value root as other callers do. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 26 ++++++++++++++++++++------ 1 files changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 70ad01c..0d81862 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4795,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, char *val, int value_len) { - int offset; + int ret, offset, block_off; struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); + void *base; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - offset = le16_to_cpu(xe->xe_name_offset) + - OCFS2_XATTR_SIZE(xe->xe_name_len); + ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + xe - xh->xh_entries, + &block_off, + &offset); + if (ret) { + mlog_errno(ret); + goto out; + } - xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + base = bucket_block(xs->bucket, block_off); + xv = (struct ocfs2_xattr_value_root *)(base + offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); - return __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + ret = __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); + if (ret) + mlog_errno(ret); +out: + return ret; } static int ocfs2_rm_xattr_cluster(struct inode *inode, -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 32/42] ocfs2: Add CoW support for xattr.
In order to make the 2 transactions (xattr and CoW) independent of each other, we CoW the whole xattr value out when we are setting it. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/refcounttree.c | 160 +++++++++++++++++++++++++++-- fs/ocfs2/refcounttree.h | 26 +++++- fs/ocfs2/xattr.c | 264 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 429 insertions(+), 23 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a09bfb5..d2a058a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7052,7 +7052,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, if (delete_blk) { if (rec_flags & OCFS2_EXT_REFCOUNTED) - status = ocfs2_decrease_refcount(inode, fe_bh, handle, + status = ocfs2_decrease_refcount(inode, handle, ocfs2_blocks_to_clusters(osb->sb, delete_blk), clusters_to_del, meta_ac, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index bf03d21..7eaab1d 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -32,6 +32,7 @@ #include "extent_map.h" #include "aops.h" #include "namei.h" +#include "xattr.h" #include <linux/security.h> #include <linux/quotaops.h> @@ -48,6 +49,7 @@ struct ocfs2_cow_context { struct buffer_head **bhs; struct page **cow_pages; int num_pages; + struct ocfs2_xattr_value_root *cow_xv; u32 cow_start; u32 cow_len; int (*get_clusters)(struct ocfs2_cow_context *context, @@ -65,6 +67,8 @@ static void ocfs2_delete_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree); static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, struct ocfs2_refcount_tree **ret_tree); +static int ocfs2_get_refcount_tree_by_inode(struct inode *inode, + struct ocfs2_refcount_tree **ret_tree); static int ocfs2_validate_refcount_block(struct super_block *sb, struct buffer_head *bh) @@ -709,7 +713,7 @@ out: return ret; } -int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_decrease_refcount(struct inode *inode, 
handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc, @@ -718,22 +722,18 @@ int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_bh = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_tree et; struct ocfs2_refcount_tree *tree; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); - BUG_ON(!di->i_refcount_loc); - ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_refcount_loc), &tree); + ret = ocfs2_get_refcount_tree_by_inode(inode, &tree); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_read_refcount_block(&tree->rf_ci, - le64_to_cpu(di->i_refcount_loc), + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, &ref_bh); if (ret) { mlog_errno(ret); @@ -1190,6 +1190,65 @@ out: return ret; } +static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0; + struct super_block *sb = context->inode->i_sb; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); + u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *old_bh = NULL; + struct buffer_head *new_bh = NULL; + + mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, + new_cluster, new_len); + + for (i = 0; i < blocks; i++, old_block++, new_block++) { + new_bh = sb_getblk(osb->sb, new_block); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_journal_access(handle, ci, new_bh, + 
OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret) { + mlog_errno(ret); + break; + } + + brelse(new_bh); + brelse(old_bh); + new_bh = NULL; + old_bh = NULL; + } + + brelse(new_bh); + brelse(old_bh); + return ret; +} + static int ocfs2_clear_ext_refcount(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u32 p_cluster, u32 len, @@ -1382,7 +1441,7 @@ static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, } static int ocfs2_replace_cow(struct ocfs2_cow_context *context, - int num_recs) + int num_recs, struct ocfs2_post_refcount *post) { int ret, credits; struct inode *inode = context->inode; @@ -1414,6 +1473,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context, */ credits += ocfs2_remove_extent_credits(inode->i_sb) * num_recs * 2; + if (post) + credits += post->credits; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1442,6 +1503,13 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context, cow_start += num_clusters; } + /* handle some pos_cow case. 
*/ + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } + ocfs2_commit_trans(osb, handle); out: @@ -1539,7 +1607,7 @@ int ocfs2_refcount_cow(struct inode *inode, ocfs2_init_refcount_extent_tree(&context.ref_et, &ref_tree->rf_ci, ref_bh); - ret = ocfs2_replace_cow(&context, num_recs); + ret = ocfs2_replace_cow(&context, num_recs, NULL); if (ret) mlog_errno(ret); @@ -1562,6 +1630,80 @@ out: return ret; } +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_xv; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Do CoW for xattr. + */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret, num_recs, has_data; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context context; + u32 cow_start, cow_len; + + memset(&context, 0, sizeof(context)); + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, + &cow_start, &cow_len, + &num_recs, &has_data); + if (ret) { + mlog_errno(ret); + return ret; + } + + BUG_ON(cow_len == 0 || num_recs == 0 || has_data == 0); + + context.inode = inode; + context.cow_start = cow_start; + context.cow_len = cow_len; + context.cow_xv = xv; + context.bhs = kcalloc(ocfs2_clusters_to_blocks(inode->i_sb, 1), + sizeof(struct buffer_head *), GFP_NOFS); + if (!context.bhs) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context.cow_duplicate_clusters = 
ocfs2_duplicate_clusters_by_jbd; + context.get_clusters = ocfs2_xattr_value_get_clusters; + + ocfs2_init_xattr_value_extent_tree(&context.data_et, + INODE_CACHE(inode), vb); + ocfs2_init_refcount_extent_tree(&context.ref_et, + &ref_tree->rf_ci, ref_bh); + + ret = ocfs2_replace_cow(&context, num_recs, post); + if (ret) + mlog_errno(ret); + +out: + kfree(context.bhs); + return ret; +} + /* * Insert a new extent into refcount tree and mark a extent rec * as refcounted in the dinode tree. diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 5c3faaf..32ae584 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -48,7 +48,7 @@ int ocfs2_set_refcount_tree(struct inode *inode, u64 blkno); int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); -int ocfs2_decrease_refcount(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_decrease_refcount(struct inode *inode, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc, @@ -69,4 +69,28 @@ int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw); + +typedef int (ocfs2_post_refcount_func)(struct inode *inode, + handle_t *handle, + void *para); +/* + * Some refcount caller need to do more work after we modify the data b-tree + * during refcount operation(including CoW and add refcount flag), and make the + * transaction complete. So it must give us this structure so that we can do it + * within our transaction. + * + */ +struct ocfs2_post_refcount { + int credits; /* credits it need for journal. */ + ocfs2_post_refcount_func *func; /* real function. 
*/ + void *para; +}; + +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 0d81862..15d89a7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -55,7 +55,7 @@ #include "buffer_head_io.h" #include "super.h" #include "xattr.h" - +#include "refcounttree.h" struct ocfs2_xattr_def_value_root { struct ocfs2_xattr_value_root xv; @@ -175,6 +175,14 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, u64 src_blk, u64 last_blk, u64 to_blk, unsigned int start_bucket, u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -642,6 +650,7 @@ leave: static int __ocfs2_remove_xattr_range(struct inode *inode, struct ocfs2_xattr_value_buf *vb, u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, struct ocfs2_xattr_set_ctxt *ctxt) { int ret; @@ -673,7 +682,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, goto out; } - ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); if (ret) mlog_errno(ret); @@ -688,6 +704,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { int ret = 0; + unsigned int ext_flags; u32 trunc_len, cpos, phys_cpos, 
alloc_size; u64 block; @@ -699,7 +716,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &vb->vb_xv->xr_list, NULL); + &vb->vb_xv->xr_list, &ext_flags); if (ret) { mlog_errno(ret); goto out; @@ -710,7 +727,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, ret = __ocfs2_remove_xattr_range(inode, vb, cpos, phys_cpos, alloc_size, - ctxt); + ext_flags, ctxt); if (ret) { mlog_errno(ret); goto out; @@ -1176,7 +1193,7 @@ static int ocfs2_xattr_get(struct inode *inode, static int __ocfs2_xattr_set_value_outside(struct inode *inode, handle_t *handle, - struct ocfs2_xattr_value_root *xv, + struct ocfs2_xattr_value_buf *vb, const void *value, int value_len) { @@ -1187,18 +1204,22 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); u64 blkno; struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, &xv->xr_list, - NULL); + &ext_flags); if (ret) { mlog_errno(ret); goto out; } + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); for (i = 0; i < num_clusters * bpc; i++, blkno++) { @@ -1350,7 +1371,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, mlog_errno(ret); return ret; } - ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, + ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, xi->value, xi->value_len); if (ret < 0) mlog_errno(ret); @@ -1589,7 +1610,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ret = __ocfs2_xattr_set_value_outside(inode, handle, - vb.vb_xv, + &vb, xi->value, xi->value_len); if (ret < 0) @@ -2425,6 +2446,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, 
struct ocfs2_xattr_search *xis, struct ocfs2_xattr_search *xbs, struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, int *credits) { int clusters_add, meta_add, ret; @@ -2441,6 +2463,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, return ret; } + meta_add += extra_meta; mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " "credits = %d\n", xi->name, meta_add, clusters_add, *credits); @@ -2708,10 +2731,11 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits; + int ret, credits, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; struct ocfs2_xattr_info xi = { .name_index = name_index, @@ -2776,6 +2800,17 @@ int ocfs2_xattr_set(struct inode *inode, goto cleanup; } + /* Check whether the value is refcounted and do some prepartion. */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } mutex_lock(&tl_inode->i_mutex); @@ -2790,7 +2825,7 @@ int ocfs2_xattr_set(struct inode *inode, mutex_unlock(&tl_inode->i_mutex); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, - &xbs, &ctxt, &credits); + &xbs, &ctxt, ref_meta, &credits); if (ret) { mlog_errno(ret); goto cleanup; @@ -2798,7 +2833,7 @@ int ocfs2_xattr_set(struct inode *inode, /* we need to update inode's ctime field, so add credit for it. 
*/ credits += OCFS2_INODE_UPDATE_CREDITS; - ctxt.handle = ocfs2_start_trans(osb, credits); + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -2817,6 +2852,8 @@ int ocfs2_xattr_set(struct inode *inode, ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); cleanup_nolock: @@ -4800,6 +4837,9 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, struct ocfs2_xattr_entry *xe = xs->here; struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); void *base; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); @@ -4816,8 +4856,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, xv = (struct ocfs2_xattr_value_root *)(base + offset + OCFS2_XATTR_SIZE(xe->xe_name_len)); + vb.vb_xv = xv; + vb.vb_bh = xs->bucket->bu_bhs[block_off]; ret = __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + &vb, val, value_len); if (ret) mlog_errno(ret); out: @@ -5308,6 +5350,204 @@ out: } /* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. 
+ */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Given a xattr value, calculate the most meta/credits we need + * if we truncat it to 0. + */ +static int ocfs2_refcounted_xattr_delete_need(struct ocfs2_super *osb, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_xattr_value_root *xv, + int *meta_add, + int *credits) +{ + int ret = 0, num_free_extents = ocfs2_num_free_extents(osb, ref_et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < le32_to_cpu(xv->xr_clusters) * 2) { + *meta_add = ocfs2_extend_meta_needed(ref_et->et_root_el); + *credits = ocfs2_calc_extend_credits(osb->sb, + ref_et->et_root_el, + le32_to_cpu(xv->xr_clusters)); + } + +out: + return ret; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. 
+ */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) +{ + int ret = 0; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_bh = NULL; + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. 
*/ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. + */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, + &(*ref_tree)->rf_ci, ref_bh); + + ret = ocfs2_refcounted_xattr_delete_need(osb, &et, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, *ref_tree, ref_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_bh); + return ret; +} + +/* * 'security' attributes support */ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 33/42] ocfs2: Remove inode from ocfs2_xattr_bucket_get_name_value.
In ocfs2_xattr_bucket_get_name_value, actually we only use super_block. So use it. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 20 ++++++++++---------- 1 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 15d89a7..386fa3a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -139,7 +139,7 @@ struct ocfs2_xattr_search { int not_found; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -1095,7 +1095,7 @@ static int ocfs2_xattr_block_get(struct inode *inode, i = xs->here - xs->header->xh_entries; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xs->bucket), i, &block_off, @@ -2291,7 +2291,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, old_in_xb = 1; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xbs->bucket), i, &block_off, &name_offset); @@ -2966,7 +2966,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, if (cmp) continue; - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, i, &block_off, @@ -3210,7 +3210,7 @@ struct ocfs2_xattr_tree_list { size_t result; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -3223,8 +3223,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); - *block_off = name_offset >> inode->i_sb->s_blocksize_bits; - *new_offset = name_offset % inode->i_sb->s_blocksize; + 
*block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; return 0; } @@ -3244,7 +3244,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, prefix = ocfs2_xattr_prefix(type); if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(bucket), i, &block_off, @@ -4843,7 +4843,7 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, xe - xh->xh_entries, &block_off, &offset); @@ -5459,7 +5459,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode, i = xbs->here - xbs->header->xh_entries; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xbs->bucket), i, &block_off, &name_offset); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 34/42] ocfs2: Abstract the creation of xattr block.
In xattr reflink, we also need to create xattr block, so abstract the process out. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 115 +++++++++++++++++++++++++++++++++--------------------- 1 files changed, 70 insertions(+), 45 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 386fa3a..e3d31f5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2099,6 +2099,72 @@ cleanup: return ret; } +static int ocfs2_create_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_bh) +{ + int ret; + u16 slot, suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &slot, &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(slot); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + di->i_xattr_loc = 
cpu_to_le64(first_blkno); + ocfs2_journal_dirty(handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + /* * ocfs2_xattr_block_set() * @@ -2111,65 +2177,24 @@ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; - u16 slot, suballoc_bit_start; - u32 num_got; - u64 first_blkno; int ret; if (!xs->xattr_bh) { - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), - xs->inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, - &slot, &suballoc_bit_start, &num_got, - &first_blkno); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - new_bh = sb_getblk(inode->i_sb, first_blkno); - ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); - - ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), - new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { + ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, + ctxt->meta_ac, &new_bh); + if (ret) { mlog_errno(ret); goto end; } - /* Initialize ocfs2_xattr_block */ xs->xattr_bh = new_bh; - xblk = (struct ocfs2_xattr_block *)new_bh->b_data; - memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(slot); - xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); - xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); - xblk->xb_blkno = cpu_to_le64(first_blkno); - + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; xs->header = &xblk->xb_attrs.xb_header; xs->base = (void *)xs->header; xs->end = (void *)xblk + inode->i_sb->s_blocksize; xs->here = xs->header->xh_entries; - - ret = ocfs2_journal_dirty(handle, 
new_bh); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - di->i_xattr_loc = cpu_to_le64(first_blkno); - ocfs2_journal_dirty(handle, xs->inode_bh); } else xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 35/42] ocfs2: Abstract ocfs2 xattr tree extend rec iteration process.
Currently we have ocfs2_iterate_xattr_buckets, which can receive a parameter and a callback to iterate over a series of buckets. It is good. But actually the 2 callers, ocfs2_xattr_tree_list_index_block and ocfs2_delete_xattr_index_block, are almost the same. The only difference is that the latter needs to handle the extent record also. So add a new function named ocfs2_iterate_xattr_index_block. It can be given a callback function which is used for the extent record. So now we only have one iteration function for the xattr index block. And what's more, it is useful for our future reflink operations. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 147 ++++++++++++++++++++++++++++-------------------------- 1 files changed, 76 insertions(+), 71 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e3d31f5..c67ba60 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -156,7 +156,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, struct ocfs2_xattr_search *xs); static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, + struct buffer_head *blk_bh, char *buffer, size_t buffer_size); @@ -169,8 +169,23 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_search *xs, struct ocfs2_xattr_set_ctxt *ctxt); -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh); +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + static int ocfs2_mv_xattr_buckets(struct inode *inode, 
handle_t *handle, u64 src_blk, u64 last_blk, u64 to_blk, unsigned int start_bucket, @@ -865,11 +880,9 @@ static int ocfs2_xattr_block_list(struct inode *inode, struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); - } else { - struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; - ret = ocfs2_xattr_tree_list_index_block(inode, xt, + } else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, buffer, buffer_size); - } brelse(blk_bh); @@ -1795,7 +1808,10 @@ static int ocfs2_xattr_block_remove(struct inode *inode, struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); ret = ocfs2_remove_value_outside(inode, &vb, header); } else - ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + NULL); return ret; } @@ -3292,22 +3308,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, return ret; } -static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, - char *buffer, - size_t buffer_size) +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) { - struct ocfs2_extent_list *el = &xt->xt_list; + struct ocfs2_xattr_block *xb + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; int ret = 0; u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; u64 p_blkno = 0; - struct ocfs2_xattr_tree_list xl = { - .buffer = buffer, - .buffer_size = buffer_size, - .result = 0, - }; - if (le16_to_cpu(el->l_next_free_rec) == 0) + if (!el->l_next_free_rec || !rec_func) return 0; while (name_hash > 0) { @@ -3315,15 +3328,14 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, &e_cpos, &num_clusters, el); if (ret) { mlog_errno(ret); - goto out; + break; } - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, 
num_clusters, - ocfs2_list_xattr_bucket, - &xl); + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); if (ret) { mlog_errno(ret); - goto out; + break; } if (e_cpos == 0) @@ -3332,6 +3344,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, name_hash = e_cpos - 1; } + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = xl.result; out: return ret; @@ -4895,7 +4938,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, u64 blkno, u32 cpos, - u32 len) + u32 len, + void *para) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -4907,6 +4951,13 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, NULL); + if (ret) { + mlog_errno(ret); + return ret; + } + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -5328,52 +5379,6 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, return ret; } -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh) -{ - struct ocfs2_xattr_block *xb - (struct ocfs2_xattr_block *)xb_bh->b_data; - struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; - int ret = 0; - u32 name_hash = UINT_MAX, e_cpos, 
num_clusters; - u64 p_blkno; - - if (le16_to_cpu(el->l_next_free_rec) == 0) - return 0; - - while (name_hash > 0) { - ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, - &e_cpos, &num_clusters, el); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, - ocfs2_delete_xattr_in_bucket, - NULL); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_rm_xattr_cluster(inode, xb_bh, - p_blkno, e_cpos, num_clusters); - if (ret) { - mlog_errno(ret); - break; - } - - if (e_cpos == 0) - break; - - name_hash = e_cpos - 1; - } - -out: - return ret; -} - /* * Whenever we modify a xattr value root in the bucket(e.g, CoW * or change the extent record flag), we need to recalculate -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 36/42] ocfs2: Attach xattr clusters to refcount tree.
In ocfs2, when xattr's value is larger than OCFS2_XATTR_INLINE_SIZE, it will be kept outside of the blocks we store xattr entry. And they are stored in a b-tree also. So this patch try to attach all these clusters to refcount tree also. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 36 +++++-- fs/ocfs2/refcounttree.h | 6 + fs/ocfs2/xattr.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/xattr.h | 5 +- 4 files changed, 317 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7eaab1d..d680edb 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1708,11 +1708,12 @@ out: * Insert a new extent into refcount tree and mark a extent rec * as refcounted in the dinode tree. */ -static int ocfs2_add_refcount_flag(struct inode *inode, - struct ocfs2_extent_tree *di_et, - struct ocfs2_extent_tree *ref_et, - u32 cpos, u32 p_cluster, u32 num_clusters, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_extent_tree *ref_et, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post) { int ret; handle_t *handle; @@ -1729,6 +1730,8 @@ static int ocfs2_add_refcount_flag(struct inode *inode, credits += ocfs2_calc_extend_credits(inode->i_sb, ref_et->et_root_el, 0); + if (post) + credits += post->credits; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1736,7 +1739,7 @@ static int ocfs2_add_refcount_flag(struct inode *inode, goto out; } - ret = ocfs2_mark_extent_refcounted(inode, di_et, handle, + ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, cpos, num_clusters, p_cluster, meta_ac, dealloc); if (ret) { @@ -1747,8 +1750,17 @@ static int ocfs2_add_refcount_flag(struct inode *inode, ret = __ocfs2_increase_refcount(handle, ref_et, p_cluster, num_clusters, meta_ac, dealloc); 
- if (ret) + if (ret) { mlog_errno(ret); + goto out_commit; + } + + /* handle some pos_cow case. */ + if (post && post->func) { + ret = post->func(inode, handle, post->para); + if (ret) + mlog_errno(ret); + } out_commit: ocfs2_commit_trans(osb, handle); @@ -1811,7 +1823,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_et, cpos - num_clusters, p_cluster, num_clusters, - &dealloc); + &dealloc, NULL); if (ret) { mlog_errno(ret); break; @@ -1824,6 +1836,14 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, */ ocfs2_extent_map_trunc(inode, 0); + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, fe_bh, + &ref_et, &dealloc); + if (ret) + mlog_errno(ret); + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_bh); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 32ae584..0ad42c0 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -93,4 +93,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, struct buffer_head *ref_bh, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_extent_tree *ref_et, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c67ba60..b606604 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -5578,6 +5578,285 @@ out: } /* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. + * The physical clusters will be added to refcount tree. 
+ */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, ref_et, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. 
+ */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, + &et, ref_et, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_et, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_extent_tree *et; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + 
bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = bucket->bu_bhs[offset]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. 
*/ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + mlog(0, "refcount bucket %llu, count = %u\n", + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->et, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_et, dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .et = ref_et, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + &para); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct 
ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_et, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_et, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +/* * 'security' attributes support */ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 5a1ebc7..22a7cee 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,5 +83,8 @@ struct ocfs2_xattr_value_buf { struct ocfs2_xattr_value_root *vb_xv; }; - +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_cached_dealloc_ctxt *dealloc); #endif /* OCFS2_XATTR_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 37/42] ocfs2: Call refcount tree remove process properly.
Now with xattr refcount support, we need to check whether any xattr is refcounted before we remove the refcount tree. The mechanism is now: 1) Check whether i_clusters == 0; if not, exit. 2) Check whether we have i_xattr_loc in the dinode; if yes, exit. 3) Check whether we have an inline xattr whose value is stored outside; if yes, exit. 4) Remove the tree. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 5 ----- fs/ocfs2/file.c | 3 +++ fs/ocfs2/inode.c | 7 +++++++ fs/ocfs2/refcounttree.c | 36 ++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 2 ++ fs/ocfs2/xattr.c | 23 +++++++++++++++++++++++ fs/ocfs2/xattr.h | 2 ++ 7 files changed, 73 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d2a058a..ef51d02 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7650,11 +7650,6 @@ bail: if (ref_tree) ocfs2_unlock_refcount_tree(osb, ref_tree, 1); - if (!status && OCFS2_I(inode)->ip_clusters == 0) { - /* remove the refcount tree. */ - status = ocfs2_remove_refcount_tree(inode, fe_bh); - } - ocfs2_schedule_truncate_log_flush(osb, 1); if (meta_ac) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 796be27..c36daa7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -59,6 +59,7 @@ #include "xattr.h" #include "acl.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -483,6 +484,8 @@ bail_unlock_sem: up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 748fd5a..1e3c006 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -52,6 +52,7 @@ #include "sysfile.h" #include "uptodate.h" #include "xattr.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -748,6 +749,12 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) 
{ + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d680edb..c12fcb0 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -427,6 +427,42 @@ out: } /* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. * 2. 
If cpos can't be found, return a fake record which start from cpos diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 0ad42c0..8939e00 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -47,6 +47,8 @@ int ocfs2_set_refcount_tree(struct inode *inode, struct buffer_head *di_bh, u64 blkno); int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); int ocfs2_decrease_refcount(struct inode *inode, handle_t *handle, u32 cpos, u32 len, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index b606604..d3fd0de 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -835,6 +835,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode, return result; } +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + static int ocfs2_xattr_ibody_list(struct inode *inode, struct ocfs2_dinode *di, char *buffer, @@ -2892,10 +2909,16 @@ int ocfs2_xattr_set(struct inode *inode, if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); + cleanup: if (ref_tree) ocfs2_unlock_refcount_tree(osb, ref_tree, 1); up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } ocfs2_inode_unlock(inode, 1); cleanup_nolock: brelse(di_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 22a7cee..e5e67e6 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, int, const char 
*, const void *, size_t, int, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); int ocfs2_init_security_get(struct inode *, struct inode *, struct ocfs2_security_xattr_info *); -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 38/42] ocfs2: Create an xattr indexed block if needed.
With reflink, there is a need that we create a new xattr indexed block from the very beginning. So add a new parameter for ocfs2_create_xattr_block. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 16 ++++++++++++++-- 1 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d3fd0de..556fae0 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2136,7 +2136,8 @@ static int ocfs2_create_xattr_block(handle_t *handle, struct inode *inode, struct buffer_head *inode_bh, struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_bh) + struct buffer_head **ret_bh, + int indexed) { int ret; u16 slot, suballoc_bit_start; @@ -2182,6 +2183,17 @@ static int ocfs2_create_xattr_block(handle_t *handle, xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); xblk->xb_blkno = cpu_to_le64(first_blkno); + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + ret = ocfs2_journal_dirty(handle, new_bh); if (ret < 0) { mlog_errno(ret); @@ -2216,7 +2228,7 @@ static int ocfs2_xattr_block_set(struct inode *inode, if (!xs->xattr_bh) { ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, - ctxt->meta_ac, &new_bh); + ctxt->meta_ac, &new_bh, 0); if (ret) { mlog_errno(ret); goto end; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 39/42] ocfs2: Add reflink support for xattr.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 34 ++- fs/ocfs2/refcounttree.h | 5 + fs/ocfs2/xattr.c | 893 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ocfs2/xattr.h | 4 + 4 files changed, 923 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c12fcb0..400e613 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -563,11 +563,11 @@ out: return ret; } -static int __ocfs2_increase_refcount(handle_t *handle, - struct ocfs2_extent_tree *et, - u32 cpos, u32 len, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0, index; struct ocfs2_extent_rec rec; @@ -1783,9 +1783,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, goto out_commit; } - ret = __ocfs2_increase_refcount(handle, ref_et, - p_cluster, num_clusters, - meta_ac, dealloc); + ret = ocfs2_increase_refcount(handle, ref_et, + p_cluster, num_clusters, + meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out_commit; @@ -1941,9 +1941,9 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode, break; } - ret = __ocfs2_increase_refcount(handle, ref_et, - p_cluster, num_clusters, - meta_ac, dealloc); + ret = ocfs2_increase_refcount(handle, ref_et, + p_cluster, num_clusters, + meta_ac, dealloc); if (ret) { mlog_errno(ret); break; @@ -2119,9 +2119,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry, } ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh); - if (ret) + if (ret) { mlog_errno(ret); + goto inode_unlock; + } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh); + if (ret) + mlog_errno(ret); + } +inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: @@ -2189,12 
+2197,14 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, goto out_unlock; } + down_write(&OCFS2_I(inode)->ip_xattr_sem); down_write(&OCFS2_I(inode)->ip_alloc_sem); DQUOT_INIT(dir); error = __ocfs2_reflink(old_dentry, old_bh, new_orphan_inode); up_write(&OCFS2_I(inode)->ip_alloc_sem); if (error) mlog_errno(error); + up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); brelse(old_bh); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 8939e00..23cb3d7 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -101,4 +101,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, u32 cpos, u32 p_cluster, u32 num_clusters, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_post_refcount *post); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 556fae0..37c3bef 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -5745,7 +5745,7 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, OCFS2_XATTR_SIZE(xe->xe_name_len)); if (bh) - *bh = bucket->bu_bhs[offset]; + *bh = bucket->bu_bhs[block_off]; out: return ret; } @@ -5892,6 +5892,897 @@ out: } /* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_extent_tree *ref_et; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementaions. 
+ */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from the scratch. + * We use get_xattr_value_root so that all types of xattr container can use it. + */ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec) + 1; + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list, + le32_to_cpu(xv->xr_clusters)); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and caculate how much credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block. 
+ */ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct ocfs2_extent_tree *ref_et, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + /* + * We have to add credits for modifying all the metadata in the + * refount tree. + */ + *credits += le16_to_cpu(ref_et->et_root_el->l_tree_depth) * + le16_to_cpu(ref_et->et_root_el->l_next_free_rec) + 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + */ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + struct ocfs2_extent_rec rec; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + mlog(0, "reflink xattr in container %llu, count = %u\n", + (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + 
break; + } + + ret = func(sb, new_bh, new_xh, i, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already be copied to the new xh with the + * propriate OCFS2_EXT_REFCOUNTED flag we just need to + * increase the refount count int the refcount tree. + * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one. + */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_int_clusters = cpu_to_le32(num_clusters); + rec.e_blkno = cpu_to_le64( + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster)); + rec.e_flags = ext_flags; + + ret = ocfs2_insert_extent(handle, &data_et, + &rec, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_et, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct 
ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_et, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_create_empty_xattr_block(struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ret = 
ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + mlog(0, "create new xattr block for inode %llu, index = %d\n", + (unsigned long long)fe_bh->b_blocknr, indexed); + ret = ocfs2_create_xattr_block(handle, inode, fe_bh, + meta_ac, ret_bh, indexed); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode); + struct ocfs2_dinode *new_di; + struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb); + int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + struct ocfs2_xattr_block *xb + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_block *new_xb + (struct ocfs2_xattr_block *)new_blk_bh->b_data; + struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = new_blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_et, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* One more credits in case we need to add xattr flags in new inode. 
*/ + handle = ocfs2_start_trans(osb, credits + 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + ret = ocfs2_journal_access_di(handle, + INODE_CACHE(args->new_inode), + args->new_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode), + new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off, + osb->sb->s_blocksize - header_off); + + ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh, + new_blk_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_journal_dirty(handle, new_blk_bh); + + if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) { + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +struct ocfs2_reflink_xattr_tree_args { + struct ocfs2_xattr_reflink *reflink; + struct buffer_head *old_blk_bh; + struct buffer_head *new_blk_bh; + struct ocfs2_xattr_bucket *old_bucket; + struct ocfs2_xattr_bucket *new_bucket; +}; + +/* + * NOTE: + * We have to handle the case that both old bucket and new bucket + * will call this function to get the right ret_bh. + * So The caller must give us the right bh. 
+ */ +static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_reflink_xattr_tree_args *args + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_xattr_bucket *bucket; + + if (bh == args->old_bucket->bu_bhs[0]) + bucket = args->old_bucket; + else + bucket = args->new_bucket; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +struct ocfs2_value_tree_metas { + int num_metas; + int credits; +}; + +static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_bucket *bucket + (struct ocfs2_xattr_bucket *)para; + + return ocfs2_get_xattr_tree_value_root(sb, bucket, offset, + xv, ret_bh); +} + +static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas + (struct ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. 
+ */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_extent_tree *ref_et = args->reflink->ref_et; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to add credits for modifying all the metadata in the + * refount tree. + */ + *credits = metas.credits + + le16_to_cpu(ref_et->et_root_el->l_tree_depth) * + le16_to_cpu(ref_et->et_root_el->l_next_free_rec) + 1; + + /* count in the xattr tree change. */ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el, len); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + 
u32 num_buckets = clusters * bpc; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu( + bucket_xh(args->old_bucket)->xh_num_buckets); + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} +/* + * Create the same xattr extent record in the new inode's xattr tree. 
+ */ +static int ocfs2_reflink_xattr_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para) +{ + int ret, credits = 0; + u32 p_cluster, num_clusters; + u64 new_blkno; + handle_t *handle; + struct ocfs2_reflink_xattr_tree_args *args + (struct ocfs2_reflink_xattr_tree_args *)para; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_rec rec; + + ocfs2_init_xattr_tree_extent_tree(&et, + INODE_CACHE(args->reflink->new_inode), + args->new_blk_bh); + + ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno, + len, &credits, + &meta_ac, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, + len, &p_cluster, &num_clusters); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster); + + mlog(0, "reflink xattr buckets %llu to %llu, len %u\n", + (unsigned long long)blkno, (unsigned long long)new_blkno, len); + ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len, + meta_ac, data_ac, args); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_leaf_clusters = cpu_to_le16(len); + rec.e_blkno = cpu_to_le64(new_blkno); + + mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", + (unsigned long long)new_blkno, len, cpos); + ret = ocfs2_insert_extent(handle, &et, &rec, meta_ac); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + return ret; +} + +/* + * Create reflinked xattr buckets. 
+ * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside. + */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(&para, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + &para); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct
buffer_head *blk_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_bh = NULL; + struct ocfs2_extent_tree ref_et; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_refcount_extent_tree(&ref_et, &ref_tree->rf_ci, ref_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_et = &ref_et; + args.dealloc = &dealloc; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); + } + +out: + return ret; +} + +/* * 'security' attributes support */ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index e5e67e6..3a75cb3 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -89,4 +89,8 @@ int ocfs2_xattr_attach_refcount_tree(struct inode *inode, struct buffer_head *fe_bh, struct ocfs2_extent_tree *ref_et, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh); #endif /* OCFS2_XATTR_H */ -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 40/42] ocfs2: Modify removing xattr process for refcount.
The old xattr value removal was quite simple: it just erased the tree and freed the clusters. But now that we have added refcount support, the process is a little more complicated. We have to lock the refcount tree at the beginning and, what's more, we may split the refcount tree in some cases, so meta/credits are needed. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/xattr.c | 170 ++++++++++++++++++++++++++++++++++++++++++----------- 1 files changed, 134 insertions(+), 36 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 37c3bef..cfc71c2 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -198,6 +198,11 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode, struct ocfs2_refcount_tree **ref_tree, int *meta_need, int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -1746,51 +1751,106 @@ out: return ret; } +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators.
+ */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *ref_et, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_allocators(inode, ref_et, 0, + le32_to_cpu(xv->xr_clusters), + NULL, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ref_credits = ocfs2_calc_extend_credits(inode->i_sb, + ref_et->et_root_el, + le32_to_cpu(xv->xr_clusters)); +out: + return ret; +} + static int ocfs2_remove_value_outside(struct inode*inode, struct ocfs2_xattr_value_buf *vb, - struct ocfs2_xattr_header *header) + struct ocfs2_xattr_header *header, + struct ocfs2_extent_tree *ref_et) { - int ret = 0, i; + int ret = 0, i, ref_credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); - ctxt.handle = ocfs2_start_trans(osb, - ocfs2_remove_extent_credits(osb->sb)); - if (IS_ERR(ctxt.handle)) { - ret = PTR_ERR(ctxt.handle); - mlog_errno(ret); - goto out; - } - for (i = 0; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; - if (!ocfs2_xattr_is_local(entry)) { - void *val; + if (ocfs2_xattr_is_local(entry)) + continue; - val = (void *)header + - le16_to_cpu(entry->xe_name_offset); - vb->vb_xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); - ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); - if (ret < 0) { - mlog_errno(ret); - break; - } + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + 
OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_et, &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + if (ret < 0) { + mlog_errno(ret); + break; + } + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; } } - ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); -out: return ret; } static int ocfs2_xattr_ibody_remove(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + struct ocfs2_extent_tree *ref_et) { struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1805,13 +1865,14 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, ((void *)di + inode->i_sb->s_blocksize - le16_to_cpu(di->i_xattr_inline_size)); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, ref_et); return ret; } static int ocfs2_xattr_block_remove(struct inode *inode, - struct buffer_head *blk_bh) + struct buffer_head *blk_bh, + struct ocfs2_extent_tree *ref_et) { struct ocfs2_xattr_block *xb; int ret = 0; @@ -1823,18 +1884,19 @@ static int ocfs2_xattr_block_remove(struct inode *inode, xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, ref_et); } else ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, ocfs2_rm_xattr_cluster, - NULL); + ref_et); return ret; } static int 
ocfs2_xattr_free_block(struct inode *inode, - u64 block) + u64 block, + struct ocfs2_extent_tree *ref_et) { struct inode *xb_alloc_inode; struct buffer_head *xb_alloc_bh = NULL; @@ -1852,7 +1914,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, goto out; } - ret = ocfs2_xattr_block_remove(inode, blk_bh); + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_et); if (ret < 0) { mlog_errno(ret); goto out; @@ -1912,6 +1974,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_bh = NULL; + struct ocfs2_extent_tree ref_et; handle_t *handle; int ret; @@ -1921,8 +1986,22 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; + memset(&ref_et, 0, sizeof(ref_et)); + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_refcount_extent_tree(&ref_et, + &ref_tree->rf_ci, ref_bh); + } + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { - ret = ocfs2_xattr_ibody_remove(inode, di_bh); + ret = ocfs2_xattr_ibody_remove(inode, di_bh, &ref_et); if (ret < 0) { mlog_errno(ret); goto out; @@ -1931,7 +2010,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (di->i_xattr_loc) { ret = ocfs2_xattr_free_block(inode, - le64_to_cpu(di->i_xattr_loc)); + le64_to_cpu(di->i_xattr_loc), + &ref_et); if (ret < 0) { mlog_errno(ret); goto out; @@ -1965,6 +2045,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_bh); 
return ret; } @@ -4987,7 +5070,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct ocfs2_extent_tree et; ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, - ocfs2_delete_xattr_in_bucket, NULL); + ocfs2_delete_xattr_in_bucket, para); if (ret) { mlog_errno(ret); return ret; @@ -5375,7 +5458,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_bucket *bucket, void *para) { - int ret = 0; + int ret = 0, ref_credits; struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 i; struct ocfs2_xattr_entry *xe; @@ -5383,7 +5466,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; int credits = ocfs2_remove_extent_credits(osb->sb) + ocfs2_blocks_per_xattr_bucket(inode->i_sb); - + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree *ref_et + (struct ocfs2_extent_tree *)para; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); @@ -5392,7 +5477,14 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, if (ocfs2_xattr_is_local(xe)) continue; - ctxt.handle = ocfs2_start_trans(osb, credits); + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + ref_et, &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -5403,12 +5495,18 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, i, 0, &ctxt); ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } if (ret) { mlog_errno(ret); break; } } + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); return ret; -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 41/42] ocfs2: Make transaction extend more efficient.
In ocfs2_extend_rotate_transaction, op_credits is the original number of credits in the handle and we only want to extend the credits for the rotation, but the old solution always doubles it. It is harmless for some minor operations, but for actions like reflink we may rotate the tree many times and cause the credits to increase dramatically. So this patch tries to increase the credits only by the desired amount. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/alloc.c | 12 ++++++++++-- 1 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ef51d02..e9a5395 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2438,10 +2438,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { + int ret; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) - return ocfs2_extend_trans(handle, credits); + if (handle->h_buffer_credits < credits) { + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + if (ret) + return ret; + + if (unlikely(handle->h_buffer_credits < credits)) + return ocfs2_extend_trans(handle, credits); + } return 0; } -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-02 23:46 UTC
[Ocfs2-devel] [PATCH 42/42] ocfs2: Enable refcount tree support.
Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/ocfs2_fs.h | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index fa550b8..9f597a3 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -96,7 +96,8 @@ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ - | OCFS2_FEATURE_INCOMPAT_META_ECC) + | OCFS2_FEATURE_INCOMPAT_META_ECC \ + | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) -- 1.6.2.rc2.16.gf474c
Tao Ma
2009-Apr-03 08:37 UTC
[Ocfs2-devel] [PATCH 00/42] ocfs2: Add reflink file support. V2
Hi all, Changes from v1 to v2: bug fixes and improvements to metadata/credits reservation. For general information about reflink, please see http://oss.oracle.com/osswiki/OCFS2/DesignDocs/Reflink. For the design docs, please see http://oss.oracle.com/osswiki/OCFS2/DesignDocs/RefcountTrees http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkOperation http://oss.oracle.com/osswiki/OCFS2/DesignDocs/ReflinkUses The patch set is based on Joel's work "ocfs2: Detach ocfs2 metadata I/O from struct node", which can be found at http://oss.oracle.com/pipermail/ocfs2-devel/2009-February/003926.html. Enjoy it. Regards, Tao