All, Please review the patches backported to 1.4 from mainline. Sunil
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 1/8] ocfs2: update comments in masklog.h
From: Coly Li <coly.li at suse.de> Mainline commit 2b53bc7bff17341d8b5ac12115f5c2363638e628 In the mainline ocfs2 code, the interface for masklog is in files under /sys/fs/o2cb/masklog, but the comments in fs/ocfs2/cluster/masklog.h reference the old /proc interface. They are out of date. This patch modifies the comments in cluster/masklog.h, which also provides a bash script example on how to change the log mask bits. Signed-off-by: Coly Li <coly.li at suse.de> Signed-off-by: Joel Becker <joel.becker at oracle.com> --- fs/ocfs2/cluster/masklog.h | 35 +++++++++++++++++------------------ 1 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 4c32d1c..041f40c 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -48,34 +48,33 @@ * only emit the appropriage printk() when the caller passes in a constant * mask, as is almost always the case. * - * All this bitmask nonsense is hidden from the /proc interface so that Joel - * doesn't have an aneurism. Reading the file gives a straight forward - * indication of which bits are on or off: - * ENTRY off - * EXIT off + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny * TCP off * MSG off * SOCKET off - * ERROR off - * NOTICE on + * ERROR allow + * NOTICE allow * * Writing changes the state of a given bit and requires a strictly formatted * single write() call: * - * write(fd, "ENTRY on", 8); + * write(fd, "allow", 5); * - * would turn the entry bit on. "1" is also accepted in the place of "on", and - * "off" and "0" behave as expected. + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: * - * Some trivial shell can flip all the bits on or off: + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done * - * log_mask="/proc/fs/ocfs2_nodemanager/log_mask" - * cat $log_mask | ( - * while read bit status; do - * # $1 is "on" or "off", say - * echo "$bit $1" > $log_mask - * done - * ) + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow */ /* for task_struct */ -- 1.6.0.4
From: Joel Becker <Joel.Becker at oracle.com> Mainline commit a731d12d6ddd1e703770cacb5dfecb155b03ee06 ocfs2 was hand-calling vfs_follow_link(), but there's no point to that. Let's use page_follow_link_light() and nd_set_link(). Signed-off-by: Joel Becker <joel.becker at oracle.com> Signed-off-by: Al Viro <viro at zeniv.linux.org.uk> --- fs/ocfs2/symlink.c | 76 +++++++++++++++++++++++++-------------------------- 1 files changed, 37 insertions(+), 39 deletions(-) diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 3fee102..0dd4b2f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/utsname.h> +#include <linux/namei.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -53,26 +54,6 @@ #include "buffer_head_io.h" -static char *ocfs2_page_getlink(struct dentry * dentry, - struct page **ppage); -static char *ocfs2_fast_symlink_getlink(struct inode *inode, - struct buffer_head **bh); - -/* get the link contents into pagecache */ -static char *ocfs2_page_getlink(struct dentry * dentry, - struct page **ppage) -{ - struct page * page; - struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - goto sync_fail; - *ppage = page; - return kmap(page); - -sync_fail: - return (char*)page; -} static char *ocfs2_fast_symlink_getlink(struct inode *inode, struct buffer_head **bh) @@ -131,35 +112,50 @@ out: return ret; } -static void *ocfs2_follow_link(struct dentry *dentry, - struct nameidata *nd) +static void *ocfs2_fast_follow_link(struct dentry *dentry, + struct nameidata *nd) + { - int status; - char *link; + int status = 0; + int len; + char *target, *link = ERR_PTR(-ENOMEM); struct inode *inode = dentry->d_inode; - struct page *page = NULL; struct buffer_head *bh = NULL; - if (ocfs2_inode_is_fast_symlink(inode)) - link = ocfs2_fast_symlink_getlink(inode, &bh); - else - link = ocfs2_page_getlink(dentry, &page); - if (IS_ERR(link)) { - status = PTR_ERR(link); + mlog_entry_void(); + + BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); + target = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(target)) { + status = PTR_ERR(target); mlog_errno(status); goto bail; } - status = vfs_follow_link(nd, link); + /* Fast symlinks can't be large */ + len = strlen(target); + link = kzalloc(len + 1, GFP_NOFS); + if (!link) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + memcpy(link, target, len); + nd_set_link(nd, link); bail: - if (page) { - kunmap(page); - page_cache_release(page); - } brelse(bh); - return ERR_PTR(status); + mlog_exit(status); + return status ? ERR_PTR(status) : link; +} + +static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +{ + char *link = cookie; + + kfree(link); } #ifdef IOP_IS_NOT_CONST @@ -168,7 +164,8 @@ struct inode_operations ocfs2_symlink_inode_operations = { const struct inode_operations ocfs2_symlink_inode_operations = { #endif .readlink = page_readlink, - .follow_link = ocfs2_follow_link, + .follow_link = page_follow_link_light, + .put_link = page_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, }; @@ -178,7 +175,8 @@ struct inode_operations ocfs2_fast_symlink_inode_operations = { const struct inode_operations ocfs2_fast_symlink_inode_operations = { #endif .readlink = ocfs2_readlink, - .follow_link = ocfs2_follow_link, + .follow_link = ocfs2_fast_follow_link, + .put_link = ocfs2_fast_put_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, }; -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 3/8] ocfs2: Correct ordering of ip_alloc_sem and localloc locks for directories
From: Jan Kara <jack at suse.cz> Mainline commit edd45c08499a3e9d4c25431cd2b6a9ce5f692c92 We use ordering ip_alloc_sem -> local alloc locks in ocfs2_write_begin(). So change lock ordering in ocfs2_extend_dir() and ocfs2_expand_inline_dir() to also use this lock ordering. Signed-off-by: Jan Kara <jack at suse.cz> Acked-by: Mark Fasheh <mfasheh at suse.com> Signed-off-by: Joel Becker <joel.becker at oracle.com> --- fs/ocfs2/dir.c | 21 ++++++++++----------- 1 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 6389c05..f251c1a 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1197,6 +1197,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, alloc = ocfs2_clusters_for_bytes(sb, bytes); + down_write(&oi->ip_alloc_sem); + /* * We should never need more than 2 clusters for this - * maximum dirent size is far less than one block. In fact, @@ -1213,8 +1215,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - down_write(&oi->ip_alloc_sem); - /* * Prepare for worst case allocation scenario of two separate * extents. @@ -1226,7 +1226,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_sem; + goto out; } /* @@ -1353,10 +1353,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, out_commit: ocfs2_commit_trans(osb, handle); -out_sem: - up_write(&oi->ip_alloc_sem); - out: + up_write(&oi->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); @@ -1468,11 +1466,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, brelse(new_bh); new_bh = NULL; + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; goto do_extend; } + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); mlog(0, "extending dir %llu (i_size = %lld)\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); @@ -1511,9 +1513,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } do_extend: - down_write(&OCFS2_I(dir)->ip_alloc_sem); - drop_alloc_sem = 1; - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -1560,10 +1559,10 @@ bail_bh: *new_de_bh = new_bh; get_bh(*new_de_bh); bail: - if (drop_alloc_sem) - up_write(&OCFS2_I(dir)->ip_alloc_sem); if (handle) ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 4/8] ocfs2: Timer to queue scan of all orphan
From: Srinivas Eeda <srinivas.eeda at oracle.com> Mainline commit bd1fda0725338a1a831cf763c2cacccd83d2776e When a dentry is unlinked, the unlinking node takes an EX on the dentry lock before moving the dentry to the orphan directory. Other nodes that have this dentry in cache have a PR on the same dentry lock. When the EX is requested, the other nodes flag the corresponding inode as MAYBE_ORPHANED during downconvert. The inode is finally deleted when the last node to iput the inode sees that i_nlink==0 and the MAYBE_ORPHANED flag is set. A problem arises if a node is forced to free dentry locks because of memory pressure. If this happens, the node will no longer get downconvert notifications for the dentries that have been unlinked on another node. If it also happens that node is actively using the corresponding inode and happens to be the one performing the last iput on that inode, it will fail to delete the inode as it will not have the MAYBE_ORPHANED flag set. This patch fixes this shortcoming by introducing a periodic scan of the orphan directories to delete such inodes. Care has been taken to distribute the workload across the cluster so that no one node has to perform the task all the time. Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> --- fs/ocfs2/dlmglue.c | 51 ++++++++++++++++++++++ fs/ocfs2/dlmglue.h | 10 ++++ fs/ocfs2/journal.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/journal.h | 4 ++ fs/ocfs2/ocfs2.h | 9 ++++ fs/ocfs2/ocfs2_lockid.h | 5 ++ fs/ocfs2/super.c | 9 ++++ 7 files changed, 195 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index bacb092..72463d8 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -246,6 +246,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, @@ -632,6 +636,19 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_rename_lops, osb); } +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan_lvb *lvb; + + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); + lvb = (struct ocfs2_orphan_scan_lvb *)res->l_lksb.lvb; + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; +} + void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { @@ -2212,6 +2229,37 @@ void ocfs2_inode_unlock(struct inode *inode, mlog_exit_void(); } +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + int status = 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + return status; + + lvb = (struct ocfs2_orphan_scan_lvb *)lockres->l_lksb.lvb; + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = (struct ocfs2_orphan_scan_lvb *)lockres->l_lksb.lvb; + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, level); +} + int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { @@ -2676,6 +2724,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->dlm = dlm; @@ -2706,6 +2755,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); dlm_unregister_domain(osb->dlm); osb->dlm = NULL; @@ -2904,6 +2954,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) { ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e3cf902..a197c09 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -49,6 +49,14 @@ struct ocfs2_meta_lvb { __be32 lvb_reserved2; }; +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -97,6 +105,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int ex); void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex); int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_dentry_lock(struct dentry *dentry, int ex); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 484ccb5..144d492 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -28,6 +28,8 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/time.h> +#include <linux/random.h> #define MLOG_MASK_PREFIX ML_JOURNAL #include <cluster/masklog.h> @@ -50,6 +52,8 @@ DEFINE_SPINLOCK(trans_inc_lock); +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, int node_num); @@ -1440,6 +1444,109 @@ bail: return status; } +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * seconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. + */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + status = ocfs2_orphan_scan_lock(osb, &seqno, LKM_EXMODE); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL); + + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno, LKM_EXMODE); +out: + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ +void ocfs2_orphan_scan_work(kapi_work_struct_t *work) +{ + struct ocfs2_orphan_scan *os; + struct ocfs2_super *osb; + + os = work_to_object(work, struct ocfs2_orphan_scan, + os_orphan_scan_work.work); + osb = os->os_osb; + + mutex_lock(&os->os_lock); + ocfs2_queue_orphan_scan(osb); + schedule_delayed_work(&os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + mutex_unlock(&os->os_lock); +} + +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + mutex_lock(&os->os_lock); + cancel_delayed_work(&os->os_orphan_scan_work); + mutex_unlock(&os->os_lock); +} + +int ocfs2_orphan_scan_init(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_osb = osb; + mutex_init(&os->os_lock); + + KAPI_INIT_DELAYED_WORK(&os->os_orphan_scan_work, + ocfs2_orphan_scan_work, os); + schedule_delayed_work(&os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + return 0; +} + struct ocfs2_orphan_filldir_priv { struct inode *head; struct ocfs2_super *osb; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index df8e9de..fa18a84 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -133,6 +133,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, } /* Exported only for the journal struct init code in super.c. Do not call. */ +int ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); + void ocfs2_complete_recovery(kapi_work_struct_t *work); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e84185d..7989966 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -145,6 +145,14 @@ struct ocfs2_lock_res { #endif }; +struct ocfs2_orphan_scan { + struct mutex os_lock; + struct ocfs2_super *os_osb; + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ + struct delayed_work os_orphan_scan_work; + u32 os_seqno; /* incremented on every scan */ +}; + struct ocfs2_dlm_debug { struct kref d_refcnt; struct dentry *d_locking_state; @@ -319,6 +327,7 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + struct ocfs2_orphan_scan osb_orphan_scan; /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 86f3e37..89e2645 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -46,6 +46,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_NUM_LOCK_TYPES }; @@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_FLOCK: c = 'F'; break; + case OCFS2_LOCK_TYPE_ORPHAN_SCAN: + c = 'P'; + break; default: c = '\0'; } @@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a421e7d..cd66b4d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1495,6 +1495,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_truncate_log_shutdown(osb); + ocfs2_orphan_scan_stop(osb); + /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); @@ -1640,6 +1642,13 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->disable_recovery = 0; osb->recovery_thread_task = NULL; + status = ocfs2_orphan_scan_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n"); + mlog_errno(status); + goto bail; + } + init_waitqueue_head(&osb->checkpoint_event); atomic_set(&osb->needs_checkpoint, 0); -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 5/8] ocfs2: Track delayed orphan scan
From: Srinivas Eeda <srinivas.eeda at oracle.com> Mainline commit f6080a698251629e8152554f5d1e33ec3ffe2a3d Patch to track delayed orphan scan timer statistics. Modifies ocfs2_osb_dump to print the following: Orphan Scan=> Local: 10 Global: 21 Last Scan: 67 seconds ago Signed-off-by: Srinivas Eeda <srinivas.eeda at oracle.com> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> --- fs/ocfs2/journal.c | 4 ++++ fs/ocfs2/ocfs2.h | 4 +++- fs/ocfs2/super.c | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 144d492..c0acffa 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1499,6 +1499,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) * number and update LVB so other node will skip the scan for a while */ seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; unlock: ocfs2_orphan_scan_unlock(osb, seqno, LKM_EXMODE); out: @@ -1538,6 +1540,8 @@ int ocfs2_orphan_scan_init(struct ocfs2_super *osb) os = &osb->osb_orphan_scan; os->os_osb = osb; + os->os_count = 0; + os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); KAPI_INIT_DELAYED_WORK(&os->os_orphan_scan_work, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7989966..9ee656d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,7 +150,9 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - u32 os_seqno; /* incremented on every scan */ + struct timespec os_scantime; /* time this node ran the scan */ + u32 os_count; /* tracks node specific scans */ + u32 os_seqno; /* tracks cluster wide scans */ }; struct ocfs2_dlm_debug { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cd66b4d..374063a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -198,6 +198,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) { int out = 0; int i; + struct ocfs2_orphan_scan *os; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -290,6 +291,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) i, osb->slot_recovery_generations[i]); } + os = &osb->osb_orphan_scan; + out += snprintf(buf + out, len - out, "Orphan Scan=> "); + out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + os->os_count, os->os_seqno); + out += snprintf(buf + out, len - out, " Last Scan: %lu seconds ago\n", + (get_seconds() - os->os_scantime.tv_sec)); return out; } -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 6/8] ocfs2: Remove redundant gotos in ocfs2_mount_volume()
From: Tao Ma <tao.ma at oracle.com> Mainline commit 06c59bb896ce23c56829f6acf667faebd51dd3b8 Signed-off-by: Tao Ma <tao.ma at oracle.com> Signed-off-by: Joel Becker <joel.becker at oracle.com> --- fs/ocfs2/super.c | 7 +------ 1 files changed, 1 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 374063a..d1cd7df 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1459,13 +1459,8 @@ static int ocfs2_mount_volume(struct super_block *sb) } status = ocfs2_truncate_log_init(osb); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto leave; - } - - if (ocfs2_mount_local(osb)) - goto leave; leave: if (unlock_super) -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 7/8] ocfs2: fdatasync should skip unimportant metadata writeout
From: Hisashi Hifumi <hifumi.hisashi at oss.ntt.co.jp> Mainline commit e04cc15f52eb072937cec2bbd8499f37afe5e1ef In ocfs2, fdatasync and fsync are identical. I think fdatasync should skip committing transaction when inode->i_state is set just I_DIRTY_SYNC and this indicates only atime or/and mtime updates. Following patch improves fdatasync throughput. #sysbench --num-threads=16 --max-requests=300000 --test=fileio --file-block-size=4K --file-total-size=16G --file-test-mode=rndwr --file-fsync-mode=fdatasync run Results: -2.6.30-rc8 Test execution summary: total time: 107.1445s total number of events: 119559 total time taken by event execution: 116.1050 per-request statistics: min: 0.0000s avg: 0.0010s max: 0.1220s approx. 95 percentile: 0.0016s Threads fairness: events (avg/stddev): 7472.4375/303.60 execution time (avg/stddev): 7.2566/0.64 -2.6.30-rc8-patched Test execution summary: total time: 86.8529s total number of events: 300016 total time taken by event execution: 24.3077 per-request statistics: min: 0.0000s avg: 0.0001s max: 0.0336s approx. 95 percentile: 0.0001s Threads fairness: events (avg/stddev): 18751.0000/718.75 execution time (avg/stddev): 1.5192/0.05 Signed-off-by: Hisashi Hifumi <hifumi.hisashi at oss.ntt.co.jp> Acked-by: Mark Fasheh <mfasheh at suse.com> Signed-off-by: Joel Becker <joel.becker at oracle.com> --- fs/ocfs2/file.c | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 267389c..1204656 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -189,6 +189,9 @@ static int ocfs2_sync_file(struct file *file, if (err) goto bail; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto bail; + journal = osb->journal->j_journal; err = journal_force_commit(journal); -- 1.6.0.4
Sunil Mushran
2009-Jun-18 01:52 UTC
[Ocfs2-devel] [PATCH 8/8] ocfs2/net: Use wait_event() in o2net_send_message_vec()
Mainline commit 9af0b38ff3f4f79c62dd909405b113bf7c1a23aa Replace wait_event_interruptible() with wait_event() in o2net_send_message_vec(). This is because this function is called by the dlm that expects signals to be blocked. Fixes oss bugzilla#1126 http://oss.oracle.com/bugzilla/show_bug.cgi?id=1126 Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> Signed-off-by: Joel Becker <joel.becker at oracle.com> --- fs/ocfs2/cluster/tcp.c | 7 ++----- 1 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 7b24751..7ea88f7 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -982,7 +982,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { - int ret, error = 0; + int ret; struct o2net_msg *msg = NULL; size_t veclen, caller_bytes = 0; struct kvec *vec = NULL; @@ -1023,10 +1023,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_time(&nst); - ret = wait_event_interruptible(nn->nn_sc_wq, - o2net_tx_can_proceed(nn, &sc, &error)); - if (!ret && error) - ret = error; + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); if (ret) goto out; -- 1.6.0.4