Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 03/20] ocfs2: Re-order iput in ocfs2_drop_dentry_lock
Mainline commit 9f70968af3e6e21612e06e153aa71c62dee5a09b Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Thu, 18 Oct 2007 12:36:10 -0700 Do this to avoid a theoretical (I haven't seen this in practice) race where the downconvert thread might drop the dentry lock, allowing a remote unlink to proceed before dropping the inode locks. This could bounce access to the orphan dir between nodes. There doesn't seem to be a need to do the same in ocfs2_dentry_iput() as that's never called for the last ref drop from the downconvert thread. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/dcache.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 014e739..9e76e3c 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -318,9 +318,9 @@ out_attach: static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - iput(dl->dl_inode); kfree(dl); } -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 10/20] ocfs2: Filter -ENOSPC in mlog_errno()
Mainline commit ef9f86ceb63f2803c9aada249986b84d2f99c635 Author: Mark Fasheh <mfasheh@ca-build8.us.oracle.com> Date: Mon, 19 Nov 2007 18:31:17 -0800 It's almost never worth printing in that situation and we keep forgetting to manually filter it out. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/cluster/masklog.h | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 75cd877..4c32d1c 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #define mlog_errno(st) do { \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE) \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 02/20] ocfs2: Create locks at initially requested level
Mainline commit 019d1b2247c6898589560c6f3b3e7ec280b0010a Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Fri, 5 Oct 2007 12:09:05 -0700 If we have not yet created a cluster lock, ocfs2_cluster_lock() will first create it at NLMODE, and then convert the lock to either PRMODE or EXMODE (whichever is requested). Change ocfs2_cluster_lock() to just create the lock at the initially requested level. ocfs2_locking_ast() handles this case fine, so the only update required was in setup of locking state. This should reduce the number of network messages required for a new lock by one, providing an incremental performance enhancement. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/dlmglue.c | 23 +++++++++-------------- 1 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index ef09fd2..4e97dcc 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -980,18 +980,6 @@ again: goto unlock; } - if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { - /* lock has not been created yet. */ - spin_unlock_irqrestore(&lockres->l_lock, flags); - - ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - goto again; - } - if (lockres->l_flags & OCFS2_LOCK_BLOCKED && !ocfs2_may_continue_on_blocked_lock(lockres, level)) { /* is the lock is currently blocked on behalf of @@ -1006,7 +994,14 @@ again: mlog(ML_ERROR, "lockres %s has action %u pending\n", lockres->l_name, lockres->l_action); - lockres->l_action = OCFS2_AST_CONVERT; + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + lockres->l_action = OCFS2_AST_ATTACH; + lkm_flags &= ~LKM_CONVERT; + } else { + lockres->l_action = OCFS2_AST_CONVERT; + lkm_flags |= LKM_CONVERT; + } + lockres->l_requested = level; lockres_or_flags(lockres, OCFS2_LOCK_BUSY); spin_unlock_irqrestore(&lockres->l_lock, flags); @@ -1021,7 +1016,7 @@ again: status = dlmlock(osb->dlm, level, &lockres->l_lksb, - lkm_flags|LKM_CONVERT, + lkm_flags, lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, ocfs2_locking_ast, -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 11/20] ocfs2: log valid inode # on bad inode
Mainline commit a46043e08f300982c51df317e2f8fb919dedadcd Author: Mark Fasheh <mfasheh@ca-build8.us.oracle.com> Date: Mon, 19 Nov 2007 18:40:16 -0800 If the inode block isn't valid then we don't want to print the value from that, instead print the block number which was passed in (which should always be correct). Also, turn this into a debug print for now - folks who hit an actual problem always have other logs indicating what the source is. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/inode.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 9e3e7df..ebb2bbe 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -455,8 +455,8 @@ static int ocfs2_read_locked_inode(struct inode *inode, status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, + mlog(0, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)args->fi_blkno, 7, fe->i_signature); goto bail; } -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 07/20] ocfs2: fix rename vs unlink race
Mainline commit e325a88f17196f18888f6e1426eb9fe3b4346d28 Author: Srinivas Eeda <srinivas.eeda@oracle.com> Date: Wed, 31 Oct 2007 16:49:43 -0700 If another node unlinks the destination while ocfs2_rename() is waiting on a cluster lock, ocfs2_rename() simply logs an error and continues. This causes a crash because the renaming node is now trying to delete a non-existent inode. The correct solution is to return -ENOENT. Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/namei.c | 13 ++++++++++--- 1 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6d822b7..f9533f7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1105,9 +1105,16 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (!new_de && new_inode) - mlog(ML_ERROR, "inode %lu does not exist in it's parent " - "directory!", new_inode->i_ino); + if (!new_de && new_inode) { + /* + * Target was unlinked by another node while we were + * waiting to get to ocfs2_rename(). There isn't + * anything we can do here to help the situation, so + * bubble up the appropriate error. + */ + status = -ENOENT; + goto bail; + } /* In case we need to overwrite an existing file, we blow it * away first */ -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 15/20] ocfs2: reverse inline-data truncate args
Mainline commit b1967d0eddeef4869ee283e692735cb994f3745a Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Tue, 20 Nov 2007 11:56:39 -0800 ocfs2_truncate() and ocfs2_remove_inode_range() had reversed their "set i_size" arguments to ocfs2_truncate_inline(). Fix things so that truncate sets i_size, and punching a hole ignores it. This exposed a problem where punching a hole in an inline-data file wasn't updating the page cache, so fix that too. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/file.c | 19 +++++++++++++++---- 1 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a6a93b1..6d1ef46 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -403,7 +403,7 @@ static int ocfs2_truncate_file(struct inode *inode, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { status = ocfs2_truncate_inline(inode, di_bh, new_i_size, - i_size_read(inode), 0); + i_size_read(inode), 1); if (status) mlog_errno(status); @@ -1525,6 +1525,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; + struct address_space *mapping = inode->i_mapping; ocfs2_init_dealloc_ctxt(&dealloc); @@ -1533,10 +1534,20 @@ static int ocfs2_remove_inode_range(struct inode *inode, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ret = ocfs2_truncate_inline(inode, di_bh, byte_start, - byte_start + byte_len, 1); - if (ret) + byte_start + byte_len, 0); + if (ret) { mlog_errno(ret); - return ret; + goto out; + } + /* + * There's no need to get fancy with the page cache + * truncate of an inline-data inode. We're talking + * about less than a page here, which will be cached + * in the dinode buffer anyway. + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + goto out; } trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 18/20] ocfs2: Allow for debugging of transaction extends
Mainline commit 0879c584ffcccd50a8d0f72cab3a51702613f901 Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Mon, 3 Dec 2007 16:42:19 -0800 The nastiest cases of transaction extends are also the rarest. We can expose them more quickly at the expense of performance by going straight to the journal_restart() in ocfs2_extend_trans(). Wrap things in OCFS2_DEBUG_FS so that we only do this when "expensive debugging" is turned on. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/journal.c | 4 ++++ 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 909a344..9988abc 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -193,11 +193,15 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); +#ifdef OCFS2_DEBUG_FS + status = 1; +#else status = journal_extend(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; } +#endif if (status > 0) { mlog(0, "journal_extend failed, trying journal_restart\n"); -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 16/20] ocfs2: fix exit-while-locked bug in ocfs2_queue_orphans()
Mainline commit a86370fbb65a0a2cb21d28bf25a748f6cc04385b Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Mon, 3 Dec 2007 14:06:23 -0800 We're holding the cluster lock when a failure might happen in ocfs2_dir_foreach() so it needs to be released. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/journal.c | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index aafd0a3..909a344 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1277,11 +1277,12 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, ocfs2_orphan_filldir); if (status) { mlog_errno(status); - goto out; + goto out_cluster; } *head = priv.head; +out_cluster: ocfs2_meta_unlock(orphan_dir_inode, 0); out: mutex_unlock(&orphan_dir_inode->i_mutex); -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 08/20] ocfs2: Reset journal parameters after s_mount_opt update
Mainline commit e001e796e47d29c470de6c2cd36400e03c66118b Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Wed, 7 Nov 2007 14:21:45 -0800 Right now we're just setting them from the existing parameters, not the new ones that a remount specified. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/super.c | 6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 43147c0..39cc000 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -442,14 +442,14 @@ unlock_osb: } if (!ret) { - if (!ocfs2_is_hard_readonly(osb)) - ocfs2_set_journal_params(osb); - /* Only save off the new mount options in case of a successful * remount. */ osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); } out: return ret; -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 14/20] ocfs2: Fix comparison in ocfs2_size_fits_inline_data()
Mainline commit 0d8a4e0cd688ad0de6430ce3425c7849cfec1c2d Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Tue, 20 Nov 2007 11:48:41 -0800 This was causing us to prematurely push out inline data by one byte. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/aops.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4999180..25338ab 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1514,7 +1514,7 @@ int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size) { struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - if (new_size < le16_to_cpu(di->id2.i_data.id_count)) + if (new_size <= le16_to_cpu(di->id2.i_data.id_count)) return 1; return 0; } -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 20/20] ocfs2: Local alloc window size changeable via mount option
Mainline commit e5b40f9bf0821554db41fd059a403b00fec955fc Author: Sunil Mushran <sunil.mushran@oracle.com> Date: Thu, 20 Dec 2007 14:50:26 -0800 Local alloc is a performance optimiztion in ocfs2 in which a node takes a window of bits from the global bitmap and then uses that for all small local allocations. This window size is fixed to 8MB currently. This patch allows users to specify the window size in MB including disabling it by passing in 0. If the number specified is too large, the fs will use the default value of 8MB. mount -o localalloc=X /dev/sdX /mntpoint Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com> --- fs/ocfs2/localalloc.c | 41 +++++++++++++++++++++++++++++------------ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/ocfs2_fs.h | 8 ++++++++ fs/ocfs2/suballoc.c | 5 +++-- fs/ocfs2/super.c | 17 +++++++++++++++++ 5 files changed, 58 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e29328a..84ed678 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -75,18 +75,11 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -/* - * Determine how large our local alloc window should be, in bits. - * - * These values (and the behavior in ocfs2_alloc_should_use_local) have - * been chosen so that most allocations, including new block groups go - * through local alloc. - */ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits < 12); + BUG_ON(osb->s_clustersize_bits > 20); - return 2048 >> (osb->s_clustersize_bits - 12); + return osb->local_alloc_size << (20 - osb->s_clustersize_bits); } /* @@ -96,18 +89,23 @@ static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { int la_bits = ocfs2_local_alloc_window_bits(osb); + int ret = 0; if (osb->local_alloc_state != OCFS2_LA_ENABLED) - return 0; + goto bail; /* la_bits should be at least twice the size (in clusters) of * a new block group. We want to be sure block group * allocations go through the local alloc, so allow an * allocation to take up to half the bitmap. */ if (bits > (la_bits / 2)) - return 0; + goto bail; - return 1; + ret = 1; +bail: + mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", + osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + return ret; } int ocfs2_load_local_alloc(struct ocfs2_super *osb) @@ -121,6 +119,19 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto bail; + + if (osb->local_alloc_size == 0) + goto bail; + + if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + mlog(ML_NOTICE, "Requested local alloc window %d is larger " + "than max possible %u. Using defaults.\n", + ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); + osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + } + /* read the alloc off disk */ inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, osb->slot_num); @@ -181,6 +192,9 @@ bail: if (inode) iput(inode); + mlog(0, "Local alloc window bits = %d\n", + ocfs2_local_alloc_window_bits(osb)); + mlog_exit(status); return status; } @@ -508,6 +522,9 @@ bail: iput(local_alloc_inode); } + mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num, + status); + mlog_exit(status); return status; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 60a23e1..df27b52 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -232,6 +232,7 @@ struct ocfs2_super atomic_t needs_checkpoint; struct ocfs2_journal *journal; + int local_alloc_size; enum ocfs2_local_alloc_state local_alloc_state; struct buffer_head *local_alloc_bh; u64 la_last_gd; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 6ef8767..9687b43 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -256,6 +256,14 @@ struct ocfs2_space_resv { /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) +/* + * Default local alloc size (in megabytes) + * + * The value chosen should be such that most allocations, including new + * block groups, use local alloc. + */ +#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8f09f52..4add3ee 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1519,8 +1519,9 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, if (min_clusters > (osb->bitmap_cpg - 1)) { /* The only paths asking for contiguousness * should know about this already. */ - mlog(ML_ERROR, "minimum allocation requested exceeds " - "group bitmap size!"); + mlog(ML_ERROR, "minimum allocation requested %u exceeds " + "group bitmap size %u!\n", min_clusters, + osb->bitmap_cpg); status = -ENOSPC; goto bail; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 39cc000..dfb14d1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -87,6 +87,7 @@ struct mount_options unsigned long mount_opt; unsigned int atime_quantum; signed short slot; + unsigned int localalloc_opt; }; static int ocfs2_parse_options(struct super_block *sb, char *options, @@ -154,6 +155,7 @@ enum { Opt_data_writeback, Opt_atime_quantum, Opt_slot, + Opt_localalloc, Opt_err, }; @@ -169,6 +171,7 @@ static match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, {Opt_slot, "preferred_slot=%u"}, + {Opt_localalloc, "localalloc=%d"}, {Opt_err, NULL} }; @@ -601,6 +604,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + osb->local_alloc_size = parsed_options.localalloc_opt; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -754,6 +758,7 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt = 0; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; + mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; if (!options) { status = 1; @@ -820,6 +825,15 @@ static int ocfs2_parse_options(struct super_block *sb, if (option) mopt->slot = (s16)option; break; + case Opt_localalloc: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8)) + mopt->localalloc_opt = option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -868,6 +882,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + return 0; } -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 19/20] ocfs2: Re-journal buffers after transaction extend
Mainline commit e8aed3450c0afd6fdb79ec233f806e3e69454dfe Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Mon, 3 Dec 2007 16:43:01 -0800 ocfs2_extend_trans() might call journal_restart() which will commit dirty buffers and then restart the transaction. This means that any buffers which still need changes should be passed to journal_access() again. Some paths during extend weren't doing this right. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/alloc.c | 66 +++++++++++++++++++++++++++++++++++---------------- fs/ocfs2/journal.c | 6 ++++ 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5714618..cdbb607 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2389,6 +2389,18 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, goto out; } + /* + * Caller might still want to make changes to the + * tree root, so re-add it to the journal here. + */ + ret = ocfs2_journal_access(handle, inode, + path_root_bh(left_path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_rotate_subtree_left(inode, handle, left_path, right_path, subtree_root, dealloc, &deleted); @@ -3289,16 +3301,6 @@ static int ocfs2_insert_path(struct inode *inode, int ret, subtree_index; struct buffer_head *leaf_bh = path_leaf_bh(right_path); - /* - * Pass both paths to the journal. The majority of inserts - * will be touching all components anyway. - */ - ret = ocfs2_journal_access_path(inode, handle, right_path); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - if (left_path) { int credits = handle->h_buffer_credits; @@ -3323,6 +3325,16 @@ static int ocfs2_insert_path(struct inode *inode, } } + /* + * Pass both paths to the journal. The majority of inserts + * will be touching all components anyway. + */ + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (insert->ins_split != SPLIT_NONE) { /* * We could call ocfs2_insert_at_leaf() for some types @@ -3331,6 +3343,17 @@ static int ocfs2_insert_path(struct inode *inode, */ ocfs2_split_record(inode, left_path, right_path, insert_rec, insert->ins_split); + + /* + * Split might have modified either leaf and we don't + * have a guarantee that the later edge insert will + * dirty this for us. + */ + if (left_path) + ret = ocfs2_journal_dirty(handle, + path_leaf_bh(left_path)); + if (ret) + mlog_errno(ret); } else ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), insert, inode); @@ -3430,6 +3453,17 @@ static int ocfs2_do_insert_extent(struct inode *inode, mlog_errno(ret); goto out; } + + /* + * ocfs2_rotate_tree_right() might have extended the + * transaction without re-journaling our tree root. + */ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } } else if (type->ins_appending == APPEND_TAIL && type->ins_contig != CONTIG_LEFT) { ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, @@ -3941,7 +3975,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); - struct buffer_head *eb_bh, *last_eb_bh = NULL; + struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; @@ -3960,14 +3994,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode, goto out; } - eb_bh = path_leaf_bh(path); - ret = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, split_index, split_rec); @@ -4029,8 +4055,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode, mlog_errno(ret); } - ocfs2_journal_dirty(handle, eb_bh); - out: brelse(last_eb_bh); return ret; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9988abc..051ffd5 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -174,6 +174,12 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, * transaction. extend_trans will either extend the current handle by * nblocks, or commit it and start a new one with nblocks credits. * + * This might call journal_restart() which will commit dirty buffers + * and then restart the transaction. Before calling + * ocfs2_extend_trans(), any changed blocks should have been + * dirtied. After calling it, all blocks which need to be changed must + * go through another set of journal_access/journal_dirty calls. + * * WARNING: This will not release any semaphores or disk locks taken * during the transaction, so make sure they were taken *before* * start_trans or we'll have ordering deadlocks. -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 17/20] ocfs2: Don't panic when truncating an empty extent
Mainline commit 92295d8054289eff0d52b4d12349f9b9df0f58e4 Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Mon, 3 Dec 2007 15:02:10 -0800 This BUG_ON() was unintentionally left in after the sparse file support was written. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/alloc.c | 2 -- 1 files changed, 0 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 1a6455e..5714618 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6092,8 +6092,6 @@ start: mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); - BUG_ON(clusters_to_del == 0); - mutex_lock(&tl_inode->i_mutex); tl_sem = 1; /* ocfs2_truncate_log_needs_flush guarantees us at least one -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] Relevant patches backported from mainline
This is a series of 20 patches backported from mainline. Each patch has the mainline commit for easy cross-checking. Sunil
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 04/20] ocfs2: Commit journal on sync writes
Maineline commit 9ea2d32f40434589ea0e136373f7d1545afb411f Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Thu, 18 Oct 2007 14:14:45 -0700 We're missing a meta data commit for extending sync writes. In thoery, write could return with the meta data required to read the data uncommitted to disk. Fix that by detecting an allocating write and forcing a journal commit in the sync case. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/file.c | 26 +++++++++++++++++++++++++- 1 files changed, 25 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ad73b3c..a6a93b1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2027,9 +2027,11 @@ static ssize_t __ocfs2_file_aio_write(struct kiocb *iocb, ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ - loff_t *ppos = &iocb->ki_pos; + loff_t old_size, *ppos = &iocb->ki_pos; + u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = filp_dentry(file)->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, @@ -2111,6 +2113,13 @@ relock: */ pos = *ppos; + /* + * To later detect whether a journal commit for sync writes is + * necessary, we sample i_size, and cluster count here. + */ + old_size = i_size_read(inode); + old_clusters = OCFS2_I(inode)->ip_clusters; + /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); @@ -2136,6 +2145,21 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { + /* + * The generic write paths have handled getting data + * to disk, but since we don't make use of the dirty + * inode list, a manual journal commit is necessary + * here. + */ + if (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = journal_force_commit(osb->journal->j_journal); + if (ret < 0) + written = ret; + } + } + /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 13/20] ocfs2: Remove bug statement in ocfs2_dentry_iput()
Mainline commit bccb9dad8949cd0df1d3d2188a1fdb4b1936d537 Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Wed, 7 Nov 2007 16:35:14 -0800 The existing bug statement didn't take into account unhashed dentries which might not have a cluster lock on them. This could happen if a node exporting the file system via NFS is rebooted, re-exported to nfs clients and then unmounted. It's fine in this case to not have a dentry cluster lock. Just remove the bug statement and replace it with an error print, which does the proper checks. Though we want to know if something has happened which might have prevented a cluster lock from being created, it's definitely not necessary to panic the machine for this. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/dcache.c | 20 ++++++++++++++++---- 1 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 9e76e3c..6ec9556 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) { struct ocfs2_dentry_lock *dl = dentry->d_fsdata; - mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED), - "dentry: %.*s\n", dentry->d_name.len, - dentry->d_name.name); + if (!dl) { + /* + * No dentry lock is ok if we're disconnected or + * unhashed. + */ + if (!(dentry->d_flags & DCACHE_DISCONNECTED) && + !d_unhashed(dentry)) { + unsigned long long ino = 0ULL; + if (inode) + ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; + mlog(ML_ERROR, "Dentry is missing cluster lock. " + "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", + ino, dentry->d_flags, dentry->d_name.len, + dentry->d_name.name); + } - if (!dl) goto out; + } mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", dentry->d_name.len, dentry->d_name.name, -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 05/20] ocfs2: fix write() performance regression
Mainline commit 4e9563fd55ff4479f2b118d0757d121dd0cfc39c Author: Mark Fasheh <mark.fasheh@oracle.com> Date: Thu, 1 Nov 2007 11:37:48 -0700 On file systems which don't support sparse files, Ocfs2_map_page_blocks() was reading blocks on appending writes. This caused write performance to suffer dramatically. Fix this by detecting an appending write on a nonsparse fs and skipping the read. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/aops.c | 22 ++++++++++++++++++++++ 1 files changed, 22 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 93c01b7..4999180 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -729,6 +729,27 @@ static void ocfs2_clear_page_regions(struct page *page, } /* + * Nonsparse file systems fully allocate before we get to the write + * code. This prevents ocfs2_write() from tagging the write as an + * allocating one, which means ocfs2_map_page_blocks() might try to + * read-in the blocks at the tail of our file. Avoid reading them by + * testing i_size against each block offset. + */ +static int ocfs2_should_read_blk(struct inode *inode, struct page *page, + unsigned int block_start) +{ + u64 offset = page_offset(page) + block_start; + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 1; + + if (i_size_read(inode) > offset) + return 1; + + return 0; +} + +/* * Some of this taken from block_prepare_write(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. @@ -781,6 +802,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && + ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 01/20] ocfs2: Fix priority mistakes in fs/ocfs2/{alloc.c, dlmglue.c}
Mainline commit 3cf0c507dd28de0e1a4c24304d806e6b3976f0f5 Author: Roel Kluin <12o3l@tiscali.nl> Date: Sat, 27 Oct 2007 00:20:36 +0200 Fixes priority mistakes similar to '!x & y' Signed-off-by: Roel Kluin <12o3l@tiscali.nl> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/dlmglue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4791433..1a6455e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3946,7 +3946,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; - if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) { + if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { ret = -EIO; mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 41c76ff..ef09fd2 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -670,7 +670,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc { mlog_entry_void(); - BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); if (lockres->l_requested > LKM_NLMODE && -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 12/20] ocfs2: Remove expensive bitmap scanning
Mainline commit 5a58c3ef22d6e5b84ff651a7d27ae2cbea9f9870 Author: Jan Kara <jack@suse.cz> Date: Tue, 13 Nov 2007 19:59:33 +0100 Enable expensive bitmap scanning only if DEBUG option is enabled. The bitmap scanning quite loads the CPU and on my machine the write throughput of dd if=/dev/zero of=/ocfs2/file bs=1M count=500 conv=sync improves from 37 MB/s to 45.4 MB/s in local mode... Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/localalloc.c | 14 +------------- 1 files changed, 1 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index d272847..e29328a 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -484,17 +484,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; - if (le32_to_cpu(alloc->id1.bitmap1.i_used) !- ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u free bits, but a count shows %u", - (unsigned long long)le64_to_cpu(alloc->i_blkno), - le32_to_cpu(alloc->id1.bitmap1.i_used), - ocfs2_local_alloc_count_bits(alloc)); - status = -EIO; - goto bail; - } - free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - le32_to_cpu(alloc->id1.bitmap1.i_used); if (bits_wanted > free_bits) { @@ -712,9 +701,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, void *bitmap; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - mlog_entry("total = %u, COUNT = %u, used = %u\n", + mlog_entry("total = %u, used = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total), - ocfs2_local_alloc_count_bits(alloc), le32_to_cpu(alloc->id1.bitmap1.i_used)); if (!alloc->id1.bitmap1.i_total) { -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 06/20] ocfs2: Fix possibly too long write in o2hb_setup_one_bio()
Mainline commit bc7e97cbdd4bef162e5772c74ee2cc4487a2d997 Author: Jan Kara <jack@suse.cz> Date: Wed, 10 Oct 2007 16:25:42 +0200 We should subtract start of our IO from PAGE_CACHE_SIZE to get the right length of the write we want to perform. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/cluster/heartbeat.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6a91607..999e7e5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -275,7 +275,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE, + vec_len = min(PAGE_CACHE_SIZE - vec_start, (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", -- 1.5.3.4
Sunil Mushran
2008-Jan-16 11:33 UTC
[Ocfs2-devel] [PATCH 09/20] ocfs2: Add missing "space"
Mainline commit 2759236f8415ccc0f275b57bd8142c2c81fd2177 Author: Joe Perches <joe@perches.com> Date: Mon, 19 Nov 2007 17:53:34 -0800 Signed-off-by: Joe Perches <joe@perches.com> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> --- fs/ocfs2/dlm/dlmmaster.c | 4 ++-- fs/ocfs2/inode.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index c8e5d84..42ea5af 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -907,7 +907,7 @@ lookup: * but they might own this lockres. wait on them. */ bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " "recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; @@ -961,7 +961,7 @@ redo_request: spin_lock(&dlm->spinlock); bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " "recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 1d5e0cb..9e3e7df 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -863,7 +863,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode, status = ocfs2_try_open_lock(inode, 1); if (status == -EAGAIN) { status = 0; - mlog(0, "Skipping delete of %llu because it is in use on" + mlog(0, "Skipping delete of %llu because it is in use on " "other nodes\n", (unsigned long long)oi->ip_blkno); goto bail; } -- 1.5.3.4