Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 1/7] ocfs2: Move slot map access into slot_map.c
From: Mark Fasheh <mark.fasheh@oracle.com> journal.c and dlmglue.c would refresh the slot map by hand. Instead, have the update and clear functions do the work inside slot_map.c. The eventual result is to make ocfs2_slot_info defined privately in slot_map.c Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 8 +----- fs/ocfs2/journal.c | 3 +- fs/ocfs2/slot_map.c | 62 +++++++++++++++++++++++++++++++++++++++----------- fs/ocfs2/slot_map.h | 11 +++----- fs/ocfs2/super.c | 3 +- 5 files changed, 55 insertions(+), 32 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64d6ff4..c2ebd72 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1935,8 +1935,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int status = 0; int level = ex ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; - struct buffer_head *bh; - struct ocfs2_slot_info *si = osb->slot_info; mlog_entry_void(); @@ -1962,11 +1960,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, goto bail; } if (status) { - bh = si->si_bh; - status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, - si->si_inode); - if (status == 0) - ocfs2_update_slot_info(si); + status = ocfs2_refresh_slot_info(osb); ocfs2_complete_lock_res_refresh(lockres, status); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b9dd370..da63375 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1109,8 +1109,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* Likewise, this would be a strange but ultimately not so * harmful place to get an error... 
*/ - ocfs2_clear_slot(si, slot_num); - status = ocfs2_update_disk_slots(osb, si); + status = ocfs2_clear_slot(osb, slot_num); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 3a50ce5..f5727b8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -49,7 +49,7 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, s16 node_num); /* post the slot information on disk into our slot_info struct. */ -void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) { int i; __le16 *disk_info; @@ -65,10 +65,27 @@ void ocfs2_update_slot_info(struct ocfs2_slot_info *si) spin_unlock(&si->si_lock); } +int ocfs2_refresh_slot_info(struct ocfs2_super *osb) +{ + int ret; + struct ocfs2_slot_info *si = osb->slot_info; + struct buffer_head *bh; + + if (si == NULL) + return 0; + + bh = si->si_bh; + ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode); + if (ret == 0) + ocfs2_update_slot_info(si); + + return ret; +} + /* post the our slot info stuff into it's destination bh and write it * out. 
*/ -int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info *si) +static int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) { int status, i; __le16 *disk_info = (__le16 *) si->si_bh->b_data; @@ -135,6 +152,19 @@ s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, return ret; } +static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + if (si == NULL) + return; + + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) + brelse(si->si_bh); + + kfree(si); +} + static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, s16 slot_num, s16 node_num) @@ -147,12 +177,18 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, si->si_global_node_nums[slot_num] = node_num; } -void ocfs2_clear_slot(struct ocfs2_slot_info *si, - s16 slot_num) +int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) { + struct ocfs2_slot_info *si = osb->slot_info; + + if (si == NULL) + return 0; + spin_lock(&si->si_lock); __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); spin_unlock(&si->si_lock); + + return ocfs2_update_disk_slots(osb, osb->slot_info); } int ocfs2_init_slot_info(struct ocfs2_super *osb) @@ -202,18 +238,17 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) osb->slot_info = si; bail: if (status < 0 && si) - ocfs2_free_slot_info(si); + __ocfs2_free_slot_info(si); return status; } -void ocfs2_free_slot_info(struct ocfs2_slot_info *si) +void ocfs2_free_slot_info(struct ocfs2_super *osb) { - if (si->si_inode) - iput(si->si_inode); - if (si->si_bh) - brelse(si->si_bh); - kfree(si); + struct ocfs2_slot_info *si = osb->slot_info; + + osb->slot_info = NULL; + __ocfs2_free_slot_info(si); } int ocfs2_find_slot(struct ocfs2_super *osb) @@ -285,7 +320,6 @@ void ocfs2_put_slot(struct ocfs2_super *osb) } bail: - osb->slot_info = NULL; - ocfs2_free_slot_info(si); + ocfs2_free_slot_info(osb); } diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 1025872..b029ffd 100644 --- a/fs/ocfs2/slot_map.h +++ 
b/fs/ocfs2/slot_map.h @@ -30,7 +30,7 @@ struct ocfs2_slot_info { spinlock_t si_lock; - struct inode *si_inode; + struct inode *si_inode; struct buffer_head *si_bh; unsigned int si_num_slots; unsigned int si_size; @@ -38,19 +38,16 @@ struct ocfs2_slot_info { }; int ocfs2_init_slot_info(struct ocfs2_super *osb); -void ocfs2_free_slot_info(struct ocfs2_slot_info *si); +void ocfs2_free_slot_info(struct ocfs2_super *osb); int ocfs2_find_slot(struct ocfs2_super *osb); void ocfs2_put_slot(struct ocfs2_super *osb); -void ocfs2_update_slot_info(struct ocfs2_slot_info *si); -int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info *si); +int ocfs2_refresh_slot_info(struct ocfs2_super *osb); s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, s16 global); -void ocfs2_clear_slot(struct ocfs2_slot_info *si, - s16 slot_num); +int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, int slot_num) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1996820..012b555 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1682,8 +1682,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ - if (osb->slot_info) - ocfs2_free_slot_info(osb->slot_info); + ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); /* FIXME -- 1.5.2.2
Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 3/7] ocfs2: Change the recovery map to an array of node numbers.
The old recovery map was a bitmap of node numbers. This was sufficient for the maximum node number of 254. Going forward, we want node numbers to be UINT32. Thus, we need a new recovery map. Note that we can't keep track of slots here. We must write down the node number to recover *before* we get the locks needed to convert a node number into a slot number. The recovery map is now an array of unsigned ints, max_slots in size. It moves to journal.c with the rest of recovery. Because it needs to be initialized, we move all of recovery initialization into a new function, ocfs2_recovery_init(). This actually cleans up ocfs2_initialize_super() a little as well. Following on, recovery cleanup becomes part of ocfs2_recovery_exit(). A number of node map functions are rendered obsolete and are removed. Finally, waiting on recovery is wrapped in a function rather than naked checks on the recovery_event. This is a cleanup from Mark. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 6 +- fs/ocfs2/heartbeat.c | 111 ---------------------------- fs/ocfs2/heartbeat.h | 18 ----- fs/ocfs2/journal.c | 195 ++++++++++++++++++++++++++++++++++++++++++++++---- fs/ocfs2/journal.h | 4 + fs/ocfs2/ocfs2.h | 3 +- fs/ocfs2/super.c | 33 ++------- 7 files changed, 196 insertions(+), 174 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c2ebd72..828f1dd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1753,8 +1753,7 @@ int ocfs2_meta_lock_full(struct inode *inode, goto local; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); lockres = &OCFS2_I(inode)->ip_meta_lockres; level = ex ? LKM_EXMODE : LKM_PRMODE; @@ -1777,8 +1776,7 @@ int ocfs2_meta_lock_full(struct inode *inode, * committed to owning this lock so we don't allow signals to * abort the operation. 
*/ if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); local: /* diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index c0efd94..8e3eac8 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -48,16 +48,10 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit); static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, int bit); -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); -static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, - struct ocfs2_node_map *from); -static void __ocfs2_node_map_set(struct ocfs2_node_map *target, - struct ocfs2_node_map *from); void ocfs2_init_node_maps(struct ocfs2_super *osb) { spin_lock_init(&osb->node_map_lock); - ocfs2_node_map_init(&osb->recovery_map); ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } @@ -196,108 +190,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb, return ret; } -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) -{ - int bit; - bit = find_next_bit(map->map, map->num_nodes, 0); - if (bit < map->num_nodes) - return 0; - return 1; -} - -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - int ret; - BUG_ON(map->num_nodes == 0); - spin_lock(&osb->node_map_lock); - ret = __ocfs2_node_map_is_empty(map); - spin_unlock(&osb->node_map_lock); - return ret; -} - -static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - BUG_ON(from->num_nodes == 0); - ocfs2_node_map_init(target); - __ocfs2_node_map_set(target, from); -} - -/* returns 1 if bit is the only bit set in target, 0 otherwise */ -int ocfs2_node_map_is_only(struct ocfs2_super *osb, - struct ocfs2_node_map *target, - int bit) -{ - struct ocfs2_node_map temp; - int ret; - - spin_lock(&osb->node_map_lock); - __ocfs2_node_map_dup(&temp, target); - 
__ocfs2_node_map_clear_bit(&temp, bit); - ret = __ocfs2_node_map_is_empty(&temp); - spin_unlock(&osb->node_map_lock); - - return ret; -} - -static void __ocfs2_node_map_set(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - int num_longs, i; - - BUG_ON(target->num_nodes != from->num_nodes); - BUG_ON(target->num_nodes == 0); - - num_longs = BITS_TO_LONGS(target->num_nodes); - for (i = 0; i < num_longs; i++) - target->map[i] = from->map[i]; -} - -/* Returns whether the recovery bit was actually set - it may not be - * if a node is still marked as needing recovery */ -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num) -{ - int set = 0; - - spin_lock(&osb->node_map_lock); - - if (!test_bit(num, osb->recovery_map.map)) { - __ocfs2_node_map_set_bit(&osb->recovery_map, num); - set = 1; - } - - spin_unlock(&osb->node_map_lock); - - return set; -} - -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num) -{ - ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); -} - -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx) -{ - int i = idx; - - idx = O2NM_INVALID_NODE_NUM; - spin_lock(&osb->node_map_lock); - if ((i != O2NM_INVALID_NODE_NUM) && - (i >= 0) && - (i < map->num_nodes)) { - while(i < map->num_nodes) { - if (test_bit(i, map->map)) { - idx = i; - break; - } - i++; - } - } - spin_unlock(&osb->node_map_lock); - return idx; -} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index 5685921..2d65f1c 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -34,8 +34,6 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb); /* node map functions - used to keep track of mounted and in-recovery * nodes. 
*/ void ocfs2_node_map_init(struct ocfs2_node_map *map); -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map); void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); @@ -45,21 +43,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, int ocfs2_node_map_test_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx); -static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - return ocfs2_node_map_iterate(osb, map, 0); -} -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num); -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num); -/* returns 1 if bit is the only bit set in target, 0 otherwise */ -int ocfs2_node_map_is_only(struct ocfs2_super *osb, - struct ocfs2_node_map *target, - int bit); #endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index c1d692b..89c275e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -64,6 +64,151 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. 
+ */ + +struct ocfs2_recovery_map { + int rm_used; + unsigned int *rm_entries; +}; + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + if (rm) + kfree(rm); +} + + +/* Behaves like test-and-set. 
Returns the previous value */ +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) { + return 1; + } + } + + return 0; +} + +static int ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = __ocfs2_recovery_map_test(osb, node_num); + spin_unlock(&osb->osb_lock); + + return ret; +} + +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; @@ -636,6 +781,23 @@ bail: return status; } +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + 
wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + /* * JBD Might read a cached version of another nodes journal file. We * don't want this as this file changes often and we get no @@ -834,6 +996,7 @@ static int __ocfs2_recovery_thread(void *arg) { int status, node_num; struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; mlog_entry_void(); @@ -849,26 +1012,29 @@ restart: goto bail; } - while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - node_num = ocfs2_node_map_first_set_bit(osb, - &osb->recovery_map); - if (node_num == O2NM_INVALID_NODE_NUM) { - mlog(0, "Out of nodes to recover.\n"); - break; - } + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); status = ocfs2_recover_node(osb, node_num); - if (status < 0) { + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { mlog(ML_ERROR, "Error %d recovering node %d on device (%u,%u)!\n", status, node_num, MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); mlog(ML_ERROR, "Volume requires unmount.\n"); - continue; } - ocfs2_recovery_map_clear(osb, node_num); + spin_lock(&osb->osb_lock); } + spin_unlock(&osb->osb_lock); + mlog(0, "All nodes recovered\n"); + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead @@ -879,8 +1045,7 @@ restart: bail: mutex_lock(&osb->recovery_lock); - if (!status && - !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + if (!status && !ocfs2_recovery_completed(osb)) { mutex_unlock(&osb->recovery_lock); goto restart; } @@ -910,8 +1075,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) /* People waiting on recovery will wait on * the recovery map to empty. 
*/ - if (!ocfs2_recovery_map_set(osb, node_num)) - mlog(0, "node %d already be in recovery.\n", node_num); + if (ocfs2_recovery_map_set(osb, node_num)) + mlog(0, "node %d already in recovery map.\n", node_num); mlog(0, "starting recovery thread...\n"); @@ -1184,7 +1349,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) if (status == -ENOENT) continue; - if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + if (ocfs2_recovery_map_test(osb, node_num)) continue; spin_unlock(&si->si_lock); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4b32e09..11008a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); /* * Journal Control: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index f8f8661..dcb9120 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -178,6 +178,7 @@ enum ocfs2_mount_options #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; +struct ocfs2_recovery_map; struct ocfs2_super { struct task_struct *commit_task; @@ -189,7 +190,6 @@ struct ocfs2_super struct ocfs2_slot_info *slot_info; spinlock_t node_map_lock; - struct ocfs2_node_map recovery_map; u64 root_blkno; u64 system_dir_blkno; @@ -224,6 +224,7 @@ struct ocfs2_super atomic_t vol_state; struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 012b555..2142985 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1166,15 +1166,6 @@ leave: return status; } -/* we can't grab the goofy sem lock from inside wait_event, so 
we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ -static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) -{ - mb(); - return osb->recovery_thread_task != NULL; -} - static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { int tmp; @@ -1191,17 +1182,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_truncate_log_shutdown(osb); - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ - mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; - mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ - flush_workqueue(ocfs2_wq); + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); ocfs2_journal_shutdown(osb); @@ -1310,7 +1292,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); - init_waitqueue_head(&osb->recovery_event); spin_lock_init(&osb->dc_task_lock); init_waitqueue_head(&osb->dc_event); osb->dc_work_sequence = 0; @@ -1330,10 +1311,12 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - mutex_init(&osb->recovery_lock); - - osb->disable_recovery = 0; - osb->recovery_thread_task = NULL; + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } init_waitqueue_head(&osb->checkpoint_event); atomic_set(&osb->needs_checkpoint, 0); -- 1.5.2.2
Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 4/7] ocfs2: slot_map I/O based on i_size.
The slot map code assumed a slot_map file has one block allocated. This changes the code to I/O as many blocks as will cover i_size. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/slot_map.c | 128 +++++++++++++++++++++++++++++++++++++++++++-------- 1 files changed, 108 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index a4f2c02..fd08592 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -46,7 +46,8 @@ struct ocfs2_slot_info_real { spinlock_t si_lock; struct inode *si_inode; - struct buffer_head *si_bh; + unsigned int si_blocks; + struct buffer_head **si_bh; unsigned int si_num_slots; unsigned int si_size; s16 si_global_node_nums[OCFS2_MAX_SLOTS]; @@ -73,7 +74,7 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) /* we don't read the slot block here as ocfs2_super_lock * should've made sure we have the most recent copy. */ spin_lock(&si->si_lock); - disk_info = (__le16 *) si->si_bh->b_data; + disk_info = (__le16 *) si->si_bh[0]->b_data; for (i = 0; i < si->si_size; i++) si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); @@ -85,13 +86,23 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) { int ret; struct ocfs2_slot_info_real *si = to_slot_info(osb); - struct buffer_head *bh; if (si == NULL) return 0; - bh = si->si_bh; - ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode); + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + mlog(0, "Refreshing slot map, reading %u block(s)\n", + si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. 
+ */ + ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, + si->si_inode); if (ret == 0) ocfs2_update_slot_info(si); @@ -104,20 +115,42 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, struct ocfs2_slot_info_real *si) { int status, i; - __le16 *disk_info = (__le16 *) si->si_bh->b_data; + __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data; spin_lock(&si->si_lock); for (i = 0; i < si->si_size; i++) disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); spin_unlock(&si->si_lock); - status = ocfs2_write_block(osb, si->si_bh, si->si_inode); + status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode); if (status < 0) mlog_errno(status); return status; } +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) +{ + unsigned long long bytes_needed; + + bytes_needed = osb->max_slots * sizeof(__le16); + if (bytes_needed < i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + /* try to find global node in the slot info. Returns * OCFS2_INVALID_SLOT if nothing is found. 
*/ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, @@ -192,13 +225,22 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, static void __ocfs2_free_slot_info(struct ocfs2_slot_info_real *si) { + unsigned int i; + if (si == NULL) return; if (si->si_inode) iput(si->si_inode); - if (si->si_bh) - brelse(si->si_bh); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } kfree(si); } @@ -229,12 +271,65 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) return ocfs2_update_disk_slots(osb, to_slot_info(osb)); } +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info_real *si) +{ + int status = 0; + u64 blkno; + unsigned long long blocks, bytes; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + mlog(0, "Slot map needs %u buffers for %llu bytes\n", + si->si_blocks, bytes); + + si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Reading slot map block %u at %llu\n", i, + (unsigned long long)blkno); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + int ocfs2_init_slot_info(struct ocfs2_super *osb) { int status, i; - u64 blkno; struct inode *inode = NULL; - struct buffer_head *bh = 
NULL; struct ocfs2_slot_info_real *si; si = kzalloc(sizeof(struct ocfs2_slot_info_real), GFP_KERNEL); @@ -259,20 +354,13 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); if (status < 0) { mlog_errno(status); goto bail; } - si->si_inode = inode; - si->si_bh = bh; osb->slot_info = (struct ocfs2_slot_info *)si; bail: if (status < 0 && si) -- 1.5.2.2
ocfs2 has a system file called "slot_map". A "slot" is a collection of files local to a particular mounted node, including the journal and allocators that node is using. The slot map converts the slot number to a node number, so when a node dies, ocfs2 knows which slot to recover. The old ocfs2 slot map is very limited. It has a physical maximum of 254 entries - specifically, it must fit within one disk block. It only allows node numbers up to 254, and cannot be extended past INT16_MAX (32767). This is a problem in the world of userspace cluster stacks, where the node numbers are often sparse and can be up to UINT32_MAX. It also has the structural problem that empty slots are signified by a magic number. That number happens to be -1 (0xFFFF). It makes for code that isn't as obvious as one would like. Thus, we introduce a new slot map format, referred to henceforth as the "extended slot map". The extended slot map is allocated as regular file space, and so is bound by i_size. The new format adds a "valid" field, distinct from the node number. Finally, it has room for extension should it be needed. The kernel patches follow this email. These patches rely on a couple of cleanups that are in the 'cluster_abstractions' branch of ocfs2.git. The cleanup patches are not part of the email thread, but are part of my git repository. The kernel code is available on the 'new-slot-map' branch of my git repository. View: http://oss.oracle.com/git/?p=jlbec/linux-2.6.git;a=shortlog;h=new-slot-map Pull: git pull git://oss.oracle.com/git/jlbec/linux-2.6.git new-slot-map The tools code is also available via git, in the 'new-slot-map' branch as well. View: http://oss.oracle.com/git/?p=ocfs2-tools.git;a=shortlog;h=new-slot-map Pull: git pull git://oss.oracle.com/git/ocfs2-tools.git new-slot-map
Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 5/7] ocfs2: De-magic the in-memory slot map.
The in-memory slot map uses the same magic as the on-disk one. There is a special value to mark a slot as invalid. It relies on the size of certain types and so on. Write a new in-memory map that keeps validity as a separate field. Outside of the I/O functions, OCFS2_INVALID_SLOT now means what it is supposed to. It also is no longer tied to the type size. This also means that only the I/O functions refer to 16bit quantities. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/journal.c | 2 +- fs/ocfs2/ocfs2.h | 6 +- fs/ocfs2/slot_map.c | 131 ++++++++++++++++++++++++++++----------------------- fs/ocfs2/slot_map.h | 2 +- 4 files changed, 77 insertions(+), 64 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 89c275e..884ebd9 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -71,7 +71,7 @@ static int ocfs2_commit_thread(void *arg); */ struct ocfs2_recovery_map { - int rm_used; + unsigned int rm_used; unsigned int *rm_entries; }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index dcb9120..5bb5b07 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -214,10 +214,10 @@ struct ocfs2_super unsigned long s_mount_opt; unsigned int s_atime_quantum; - u16 max_slots; + unsigned int max_slots; s16 node_num; - s16 slot_num; - s16 preferred_slot; + int slot_num; + int preferred_slot; int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index fd08592..b7d592a 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -42,6 +42,12 @@ #include "buffer_head_io.h" + +struct ocfs2_slot { + int sl_valid; + unsigned int sl_node_num; +}; + struct ocfs2_slot_info_real { spinlock_t si_lock; @@ -49,8 +55,7 @@ struct ocfs2_slot_info_real { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - unsigned int si_size; - s16 si_global_node_nums[OCFS2_MAX_SLOTS]; + struct ocfs2_slot *si_slots; }; static inline struct ocfs2_slot_info_real 
*to_slot_info(struct ocfs2_super *osb) @@ -59,11 +64,26 @@ static inline struct ocfs2_slot_info_real *to_slot_info(struct ocfs2_super *osb) } -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, - s16 global); -static void __ocfs2_fill_slot(struct ocfs2_slot_info_real *si, - s16 slot_num, - s16 node_num); +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, + unsigned int node_num); + +static void ocfs2_invalidate_slot(struct ocfs2_slot_info_real *si, + int slot_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + si->si_slots[slot_num].sl_valid = 0; +} + +static void ocfs2_set_slot(struct ocfs2_slot_info_real *si, + int slot_num, unsigned int node_num) +{ + BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); + BUG_ON((node_num == O2NM_INVALID_NODE_NUM) || + (node_num >= O2NM_MAX_NODES)); + + si->si_slots[slot_num].sl_valid = 1; + si->si_slots[slot_num].sl_node_num = node_num; +} /* post the slot information on disk into our slot_info struct. 
*/ static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) @@ -76,8 +96,12 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) spin_lock(&si->si_lock); disk_info = (__le16 *) si->si_bh[0]->b_data; - for (i = 0; i < si->si_size; i++) - si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); + for (i = 0; i < si->si_num_slots; i++) { + if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT) + ocfs2_invalidate_slot(si, i); + else + ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i])); + } spin_unlock(&si->si_lock); } @@ -118,8 +142,13 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data; spin_lock(&si->si_lock); - for (i = 0; i < si->si_size; i++) - disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); + for (i = 0; i < si->si_num_slots; i++) { + if (si->si_slots[i].sl_valid) + disk_info[i] = + cpu_to_le16(si->si_slots[i].sl_node_num); + else + disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + } spin_unlock(&si->si_lock); status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode); @@ -140,7 +169,7 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, unsigned long long bytes_needed; bytes_needed = osb->max_slots * sizeof(__le16); - if (bytes_needed < i_size_read(inode)) { + if (bytes_needed > i_size_read(inode)) { mlog(ML_ERROR, "Slot map file is too small! (size %llu, needed %llu)\n", i_size_read(inode), bytes_needed); @@ -151,39 +180,39 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, return 0; } -/* try to find global node in the slot info. Returns - * OCFS2_INVALID_SLOT if nothing is found. */ -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, - s16 global) +/* try to find global node in the slot info. Returns -ENOENT + * if nothing is found. 
*/ +static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, + unsigned int node_num) { - int i; - s16 ret = OCFS2_INVALID_SLOT; + int i, ret = -ENOENT; for(i = 0; i < si->si_num_slots; i++) { - if (global == si->si_global_node_nums[i]) { - ret = (s16) i; + if (si->si_slots[i].sl_valid && + (node_num == si->si_slots[i].sl_node_num)) { + ret = i; break; } } + return ret; } -static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info_real *si, - s16 preferred) +static int __ocfs2_find_empty_slot(struct ocfs2_slot_info_real *si, + int preferred) { - int i; - s16 ret = OCFS2_INVALID_SLOT; + int i, ret = -ENOSPC; - if (preferred >= 0 && preferred < si->si_num_slots) { - if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { + if ((preferred >= 0) && (preferred < si->si_num_slots)) { + if (!si->si_slots[preferred].sl_valid) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { - ret = (s16) i; + if (!si->si_slots[i].sl_valid) { + ret = i; break; } } @@ -193,16 +222,13 @@ out: int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) { - s16 slot; + int slot; struct ocfs2_slot_info_real *si = to_slot_info(osb); spin_lock(&si->si_lock); slot = __ocfs2_node_num_to_slot(si, node_num); spin_unlock(&si->si_lock); - if (slot == OCFS2_INVALID_SLOT) - return -ENOENT; - return slot; } @@ -216,10 +242,10 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, BUG_ON(slot_num < 0); BUG_ON(slot_num > osb->max_slots); - if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT) + if (!si->si_slots[slot_num].sl_valid) return -ENOENT; - *node_num = si->si_global_node_nums[slot_num]; + *node_num = si->si_slots[slot_num].sl_node_num; return 0; } @@ -245,19 +271,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info_real *si) kfree(si); } -static void __ocfs2_fill_slot(struct ocfs2_slot_info_real *si, - s16 slot_num, - s16 node_num) -{ - 
BUG_ON(slot_num == OCFS2_INVALID_SLOT); - BUG_ON(slot_num >= si->si_num_slots); - BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && - (node_num >= O2NM_MAX_NODES)); - - si->si_global_node_nums[slot_num] = node_num; -} - -int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) { struct ocfs2_slot_info_real *si = to_slot_info(osb); @@ -265,7 +279,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) return 0; spin_lock(&si->si_lock); - __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); + ocfs2_invalidate_slot(si, slot_num); spin_unlock(&si->si_lock); return ocfs2_update_disk_slots(osb, to_slot_info(osb)); @@ -328,11 +342,13 @@ bail: int ocfs2_init_slot_info(struct ocfs2_super *osb) { - int status, i; + int status; struct inode *inode = NULL; struct ocfs2_slot_info_real *si; - si = kzalloc(sizeof(struct ocfs2_slot_info_real), GFP_KERNEL); + si = kzalloc(sizeof(struct ocfs2_slot_info_real) + + (sizeof(struct ocfs2_slot) * osb->max_slots), + GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -341,10 +357,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) spin_lock_init(&si->si_lock); si->si_num_slots = osb->max_slots; - si->si_size = OCFS2_MAX_SLOTS; - - for(i = 0; i < si->si_num_slots; i++) - si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; + si->si_slots = (struct ocfs2_slot *)((char *)si + sizeof(struct ocfs2_slot_info_real)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -380,7 +393,7 @@ void ocfs2_free_slot_info(struct ocfs2_super *osb) int ocfs2_find_slot(struct ocfs2_super *osb) { int status; - s16 slot; + int slot; struct ocfs2_slot_info_real *si; mlog_entry_void(); @@ -395,11 +408,11 @@ int ocfs2_find_slot(struct ocfs2_super *osb) * own journal recovery? 
Possibly not, though we certainly * need to warn to the user */ slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot == OCFS2_INVALID_SLOT) { + if (slot < 0) { /* if no slot yet, then just take 1st available * one. */ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); - if (slot == OCFS2_INVALID_SLOT) { + if (slot < 0) { spin_unlock(&si->si_lock); mlog(ML_ERROR, "no free slots available!\n"); status = -EINVAL; @@ -409,7 +422,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", slot); - __ocfs2_fill_slot(si, slot, osb->node_num); + ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; spin_unlock(&si->si_lock); @@ -435,7 +448,7 @@ void ocfs2_put_slot(struct ocfs2_super *osb) ocfs2_update_slot_info(si); spin_lock(&si->si_lock); - __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); + ocfs2_invalidate_slot(si, osb->slot_num); osb->slot_num = OCFS2_INVALID_SLOT; spin_unlock(&si->si_lock); diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 14b065b..25ef6cf 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -43,6 +43,6 @@ int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, unsigned int *node_num); -int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); +int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); #endif -- 1.5.2.2
The old slot map had a few limitations: - It was limited to one block, so the maximum slot count was 255. - Each slot was signed 16bits, limiting node numbers to INT16_MAX. - An empty slot was marked by the magic 0xFFFF (-1). The new slot map format provides 32bit node numbers (UINT32_MAX), a separate space to mark a slot in use, and extra room to grow. The slot map is now bounded by i_size, not a block. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/ocfs2.h | 6 +++ fs/ocfs2/ocfs2_fs.h | 31 ++++++++++++++- fs/ocfs2/slot_map.c | 107 ++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 128 insertions(+), 16 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 5bb5b07..36a9e09 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -363,6 +363,12 @@ static inline int ocfs2_mount_local(struct ocfs2_super *osb) return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); } +static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); +} + + #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 26a5565..85a2b21 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -88,7 +88,8 @@ #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ - | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) + | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ + | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -125,6 +126,10 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + + /* * backup superblock flag is used to indicate that 
this volume * has backup superblocks. @@ -454,7 +459,8 @@ struct ocfs2_extent_block /* * On disk slot map for OCFS2. This defines the contents of the "slot_map" - * system file. + * system file. A slot is valid if it contains a node number >= 0. The + * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { /*00*/ __le16 sm_slots[0]; @@ -464,6 +470,27 @@ struct ocfs2_slot_map { */ }; +struct ocfs2_extended_slot { +/*00*/ __u8 es_valid; + __u8 es_reserved1[3]; + __le32 es_node_num; +/*10*/ +}; + +/* + * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP + * is set. It separates out the valid marker from the node number, and + * has room to grow. Unlike the old slot map, this format is defined by + * i_size. + */ +struct ocfs2_slot_map_extended { +/*00*/ struct ocfs2_extended_slot se_slots[0]; +/* + * Actual size is i_size of the slot_map system file. It should + * match s_max_slots * sizeof(struct ocfs2_extended_slot) + */ +}; + /* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index a458d08..005a7dc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -51,6 +51,8 @@ struct ocfs2_slot { struct ocfs2_slot_info_real { spinlock_t si_lock; + int si_extended; + int si_slots_per_block; struct inode *si_inode; unsigned int si_blocks; struct buffer_head **si_bh; @@ -85,15 +87,33 @@ static void ocfs2_set_slot(struct ocfs2_slot_info_real *si, si->si_slots[slot_num].sl_node_num = node_num; } +/* This version is for the extended slot map */ +static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info_real *si) +{ + int b, i, slotno; + struct ocfs2_slot_map_extended *se; + + slotno = 0; + for (b = 0; b < si->si_blocks; b++) { + se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; + for (i = 0; + (i < si->si_slots_per_block) && (slotno < si->si_num_slots); + i++, slotno++) { + if 
(se->se_slots[i].es_valid) + ocfs2_set_slot(si, slotno, + le32_to_cpu(se->se_slots[i].es_node_num)); + else + ocfs2_invalidate_slot(si, slotno); + } + } +} + /* post the slot information on disk into our slot_info struct. */ -static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) +static void ocfs2_update_slot_info_old(struct ocfs2_slot_info_real *si) { int i; struct ocfs2_slot_map *sm; - /* we don't read the slot block here as ocfs2_super_lock - * should've made sure we have the most recent copy. */ - spin_lock(&si->si_lock); sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { @@ -102,7 +122,17 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) else ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); } +} +static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) +{ + /* we don't read the slot block here as ocfs2_super_lock + * should've made sure we have the most recent copy. */ + spin_lock(&si->si_lock); + if (si->si_extended) + ocfs2_update_slot_info_extended(si); + else + ocfs2_update_slot_info_old(si); spin_unlock(&si->si_lock); } @@ -135,13 +165,31 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) /* post the our slot info stuff into it's destination bh and write it * out. 
*/ -static int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info_real *si) +static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info_real *si, + int slot_num, + struct buffer_head **bh) +{ + int blkind = slot_num / si->si_slots_per_block; + int slotno = slot_num % si->si_slots_per_block; + struct ocfs2_slot_map_extended *se; + + BUG_ON(blkind >= si->si_blocks); + + se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; + se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; + if (si->si_slots[slot_num].sl_valid) + se->se_slots[slotno].es_node_num = + cpu_to_le32(si->si_slots[slot_num].sl_node_num); + *bh = si->si_bh[blkind]; +} + +static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info_real *si, + int slot_num, + struct buffer_head **bh) { - int status, i; + int i; struct ocfs2_slot_map *sm; - spin_lock(&si->si_lock); sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { if (si->si_slots[i].sl_valid) @@ -150,9 +198,24 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, else sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); } + *bh = si->si_bh[0]; +} + +static int ocfs2_update_disk_slot(struct ocfs2_super *osb, + struct ocfs2_slot_info_real *si, + int slot_num) +{ + int status; + struct buffer_head *bh; + + spin_lock(&si->si_lock); + if (si->si_extended) + ocfs2_update_disk_slot_extended(si, slot_num, &bh); + else + ocfs2_update_disk_slot_old(si, slot_num, &bh); spin_unlock(&si->si_lock); - status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode); + status = ocfs2_write_block(osb, bh, si->si_inode); if (status < 0) mlog_errno(status); @@ -169,7 +232,12 @@ static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, { unsigned long long bytes_needed; - bytes_needed = osb->max_slots * sizeof(__le16); + if (ocfs2_uses_extended_slot_map(osb)) { + bytes_needed = osb->max_slots * + sizeof(struct ocfs2_extended_slot); + } else { + bytes_needed = 
osb->max_slots * sizeof(__le16); + } if (bytes_needed > i_size_read(inode)) { mlog(ML_ERROR, "Slot map file is too small! (size %llu, needed %llu)\n", @@ -283,7 +351,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) ocfs2_invalidate_slot(si, slot_num); spin_unlock(&si->si_lock); - return ocfs2_update_disk_slots(osb, to_slot_info(osb)); + return ocfs2_update_disk_slot(osb, to_slot_info(osb), slot_num); } static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, @@ -305,6 +373,15 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, if (!si->si_blocks) goto bail; + if (si->si_extended) + si->si_slots_per_block = + osb->sb->s_blocksize / sizeof(struct ocfs2_extended_slot); + else + si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); + + /* The size checks above should ensure this */ + BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); + mlog(0, "Slot map needs %u buffers for %llu bytes\n", si->si_blocks, bytes); @@ -357,6 +434,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) } spin_lock_init(&si->si_lock); + si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = osb->max_slots; si->si_slots = (struct ocfs2_slot *)((char *)si + sizeof(struct ocfs2_slot_info_real)); @@ -429,7 +507,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) mlog(0, "taking node slot %d\n", osb->slot_num); - status = ocfs2_update_disk_slots(osb, si); + status = ocfs2_update_disk_slot(osb, si, osb->slot_num); if (status < 0) mlog_errno(status); @@ -440,7 +518,7 @@ bail: void ocfs2_put_slot(struct ocfs2_super *osb) { - int status; + int status, slot_num; struct ocfs2_slot_info_real *si = to_slot_info(osb); if (!si) @@ -449,11 +527,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb) ocfs2_update_slot_info(si); spin_lock(&si->si_lock); + slot_num = osb->slot_num; ocfs2_invalidate_slot(si, osb->slot_num); osb->slot_num = OCFS2_INVALID_SLOT; spin_unlock(&si->si_lock); - status = ocfs2_update_disk_slots(osb, si); + status = 
ocfs2_update_disk_slot(osb, si, slot_num); if (status < 0) { mlog_errno(status); goto bail; -- 1.5.2.2
Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 6/7] ocfs2: Define the contents of the slot_map file.
The slot map file is merely an array of __le16. Wrap it in a structure for cleaner reference. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/ocfs2_fs.h | 12 ++++++++++++ fs/ocfs2/slot_map.c | 15 ++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 6ef8767..26a5565 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -453,6 +453,18 @@ struct ocfs2_extent_block }; /* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +/* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index b7d592a..a458d08 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -89,18 +89,18 @@ static void ocfs2_set_slot(struct ocfs2_slot_info_real *si, static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) { int i; - __le16 *disk_info; + struct ocfs2_slot_map *sm; /* we don't read the slot block here as ocfs2_super_lock * should've made sure we have the most recent copy. 
*/ spin_lock(&si->si_lock); - disk_info = (__le16 *) si->si_bh[0]->b_data; + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { - if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT) + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) ocfs2_invalidate_slot(si, i); else - ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i])); + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); } spin_unlock(&si->si_lock); @@ -139,15 +139,16 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, struct ocfs2_slot_info_real *si) { int status, i; - __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data; + struct ocfs2_slot_map *sm; spin_lock(&si->si_lock); + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { if (si->si_slots[i].sl_valid) - disk_info[i] = + sm->sm_slots[i] = cpu_to_le16(si->si_slots[i].sl_node_num); else - disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); } spin_unlock(&si->si_lock); -- 1.5.2.2
Joel Becker
2007-Dec-06 21:21 UTC
[Ocfs2-devel] [PATCH 2/7] ocfs2: Wrap all other access to ocfs2_slot_info.
The last place to reference naked slot_info fields is ocfs2_mark_nodes_dirty(). It's part of the mount path, and it races node down events, so the locking must be honored. We basically wrap the check for what node is in each slot, thus allowing us to change the mechanism on the backend. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/journal.c | 15 +++++---- fs/ocfs2/slot_map.c | 84 +++++++++++++++++++++++++++++++++++++-------------- fs/ocfs2/slot_map.h | 21 ++---------- 3 files changed, 73 insertions(+), 47 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index da63375..c1d692b 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1065,7 +1065,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, { int status = 0; int slot_num; - struct ocfs2_slot_info *si = osb->slot_info; struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; @@ -1078,8 +1077,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, * case we should've called ocfs2_journal_load instead. */ BUG_ON(osb->node_num == node_num); - slot_num = ocfs2_node_num_to_slot(si, node_num); - if (slot_num == OCFS2_INVALID_SLOT) { + slot_num = ocfs2_node_num_to_slot(osb, node_num); + if (slot_num == -ENOENT) { status = 0; mlog(0, "no slot for this node, so no recovery required.\n"); goto done; @@ -1169,20 +1168,22 @@ bail: * slot info struct has been updated from disk. */ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { - int status, i, node_num; + unsigned int node_num; + int status, i; struct ocfs2_slot_info *si = osb->slot_info; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. 
*/ spin_lock(&si->si_lock); - for(i = 0; i < si->si_num_slots; i++) { + for(i = 0; i < osb->max_slots; i++) { if (i == osb->slot_num) continue; - if (ocfs2_is_empty_slot(si, i)) + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) continue; - node_num = si->si_global_node_nums[i]; if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) continue; spin_unlock(&si->si_lock); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index f5727b8..a4f2c02 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -42,14 +42,30 @@ #include "buffer_head_io.h" -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, +struct ocfs2_slot_info_real { + spinlock_t si_lock; + + struct inode *si_inode; + struct buffer_head *si_bh; + unsigned int si_num_slots; + unsigned int si_size; + s16 si_global_node_nums[OCFS2_MAX_SLOTS]; +}; + +static inline struct ocfs2_slot_info_real *to_slot_info(struct ocfs2_super *osb) +{ + return (struct ocfs2_slot_info_real *)osb->slot_info; +} + + +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, s16 global); -static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, +static void __ocfs2_fill_slot(struct ocfs2_slot_info_real *si, s16 slot_num, s16 node_num); /* post the slot information on disk into our slot_info struct. */ -static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +static void ocfs2_update_slot_info(struct ocfs2_slot_info_real *si) { int i; __le16 *disk_info; @@ -68,7 +84,7 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) int ocfs2_refresh_slot_info(struct ocfs2_super *osb) { int ret; - struct ocfs2_slot_info *si = osb->slot_info; + struct ocfs2_slot_info_real *si = to_slot_info(osb); struct buffer_head *bh; if (si == NULL) @@ -85,7 +101,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) /* post the our slot info stuff into it's destination bh and write it * out. 
*/ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, - struct ocfs2_slot_info *si) + struct ocfs2_slot_info_real *si) { int status, i; __le16 *disk_info = (__le16 *) si->si_bh->b_data; @@ -104,7 +120,7 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, /* try to find global node in the slot info. Returns * OCFS2_INVALID_SLOT if nothing is found. */ -static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info_real *si, s16 global) { int i; @@ -119,7 +135,8 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, return ret; } -static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info_real *si, + s16 preferred) { int i; s16 ret = OCFS2_INVALID_SLOT; @@ -141,18 +158,39 @@ out: return ret; } -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global) +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) { - s16 ret; + s16 slot; + struct ocfs2_slot_info_real *si = to_slot_info(osb); spin_lock(&si->si_lock); - ret = __ocfs2_node_num_to_slot(si, global); + slot = __ocfs2_node_num_to_slot(si, node_num); spin_unlock(&si->si_lock); - return ret; + + if (slot == OCFS2_INVALID_SLOT) + return -ENOENT; + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info_real *si = to_slot_info(osb); + + assert_spin_locked(&si->si_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num > osb->max_slots); + + if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT) + return -ENOENT; + + *node_num = si->si_global_node_nums[slot_num]; + return 0; } -static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) +static void __ocfs2_free_slot_info(struct ocfs2_slot_info_real *si) { if (si == NULL) return; @@ -165,7 +203,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) kfree(si); } 
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, +static void __ocfs2_fill_slot(struct ocfs2_slot_info_real *si, s16 slot_num, s16 node_num) { @@ -179,7 +217,7 @@ static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) { - struct ocfs2_slot_info *si = osb->slot_info; + struct ocfs2_slot_info_real *si = to_slot_info(osb); if (si == NULL) return 0; @@ -188,7 +226,7 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); spin_unlock(&si->si_lock); - return ocfs2_update_disk_slots(osb, osb->slot_info); + return ocfs2_update_disk_slots(osb, to_slot_info(osb)); } int ocfs2_init_slot_info(struct ocfs2_super *osb) @@ -197,9 +235,9 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) u64 blkno; struct inode *inode = NULL; struct buffer_head *bh = NULL; - struct ocfs2_slot_info *si; + struct ocfs2_slot_info_real *si; - si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); + si = kzalloc(sizeof(struct ocfs2_slot_info_real), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -235,7 +273,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_inode = inode; si->si_bh = bh; - osb->slot_info = si; + osb->slot_info = (struct ocfs2_slot_info *)si; bail: if (status < 0 && si) __ocfs2_free_slot_info(si); @@ -245,7 +283,7 @@ bail: void ocfs2_free_slot_info(struct ocfs2_super *osb) { - struct ocfs2_slot_info *si = osb->slot_info; + struct ocfs2_slot_info_real *si = to_slot_info(osb); osb->slot_info = NULL; __ocfs2_free_slot_info(si); @@ -255,11 +293,11 @@ int ocfs2_find_slot(struct ocfs2_super *osb) { int status; s16 slot; - struct ocfs2_slot_info *si; + struct ocfs2_slot_info_real *si; mlog_entry_void(); - si = osb->slot_info; + si = to_slot_info(osb); ocfs2_update_slot_info(si); @@ -301,7 +339,7 @@ bail: void ocfs2_put_slot(struct ocfs2_super *osb) { int status; - struct ocfs2_slot_info *si = osb->slot_info; + struct 
ocfs2_slot_info_real *si = to_slot_info(osb); if (!si) return; diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index b029ffd..14b065b 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -29,12 +29,6 @@ struct ocfs2_slot_info { spinlock_t si_lock; - - struct inode *si_inode; - struct buffer_head *si_bh; - unsigned int si_num_slots; - unsigned int si_size; - s16 si_global_node_nums[OCFS2_MAX_SLOTS]; }; int ocfs2_init_slot_info(struct ocfs2_super *osb); @@ -45,17 +39,10 @@ void ocfs2_put_slot(struct ocfs2_super *osb); int ocfs2_refresh_slot_info(struct ocfs2_super *osb); -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global); -int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); - -static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, - int slot_num) -{ - BUG_ON(slot_num == OCFS2_INVALID_SLOT); - assert_spin_locked(&si->si_lock); +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); - return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; -} +int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); #endif -- 1.5.2.2
Mark Fasheh
2007-Dec-31 16:49 UTC
[Ocfs2-devel] [PATCH 4/7] ocfs2: slot_map I/O based on i_size.
On Thu, Dec 06, 2007 at 09:19:44PM -0800, Joel Becker wrote:
> The slot map code assumed a slot_map file has one block allocated.
> This changes the code to I/O as many blocks as will cover i_size.
>
> Signed-off-by: Joel Becker <joel.becker@oracle.com>

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
--
Mark Fasheh
Principal Software Developer, Oracle
mark.fasheh@oracle.com
Joel Becker
2008-Jan-03 13:41 UTC
[Ocfs2-devel] [PATCH 4/7] ocfs2: slot_map I/O based on max_slots.
The slot map code assumed a slot_map file has one block allocated. This changes the code to I/O as many blocks as will cover max_slots. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/slot_map.c | 128 +++++++++++++++++++++++++++++++++++++++++++-------- 1 files changed, 108 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 762360d..5bddee1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -44,7 +44,8 @@ struct ocfs2_slot_info { struct inode *si_inode; - struct buffer_head *si_bh; + unsigned int si_blocks; + struct buffer_head **si_bh; unsigned int si_num_slots; unsigned int si_size; s16 si_global_node_nums[OCFS2_MAX_SLOTS]; @@ -68,7 +69,7 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) /* we don't read the slot block here as ocfs2_super_lock * should've made sure we have the most recent copy. */ - disk_info = (__le16 *) si->si_bh->b_data; + disk_info = (__le16 *) si->si_bh[0]->b_data; for (i = 0; i < si->si_size; i++) si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); @@ -78,13 +79,23 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) { int ret; struct ocfs2_slot_info *si = osb->slot_info; - struct buffer_head *bh; if (si == NULL) return 0; - bh = si->si_bh; - ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode); + BUG_ON(si->si_blocks == 0); + BUG_ON(si->si_bh == NULL); + + mlog(0, "Refreshing slot map, reading %u block(s)\n", + si->si_blocks); + + /* + * We pass -1 as blocknr because we expect all of si->si_bh to + * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If + * this is not true, the read of -1 (UINT64_MAX) will fail. 
+ */ + ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, + si->si_inode); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -100,20 +111,42 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, struct ocfs2_slot_info *si) { int status, i; - __le16 *disk_info = (__le16 *) si->si_bh->b_data; + __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data; spin_lock(&osb->osb_lock); for (i = 0; i < si->si_size; i++) disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); spin_unlock(&osb->osb_lock); - status = ocfs2_write_block(osb, si->si_bh, si->si_inode); + status = ocfs2_write_block(osb, si->si_bh[0], si->si_inode); if (status < 0) mlog_errno(status); return status; } +/* + * Calculate how many bytes are needed by the slot map. Returns + * an error if the slot map file is too small. + */ +static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, + struct inode *inode, + unsigned long long *bytes) +{ + unsigned long long bytes_needed; + + bytes_needed = osb->max_slots * sizeof(__le16); + if (bytes_needed > i_size_read(inode)) { + mlog(ML_ERROR, + "Slot map file is too small! (size %llu, needed %llu)\n", + i_size_read(inode), bytes_needed); + return -ENOSPC; + } + + *bytes = bytes_needed; + return 0; +} + /* try to find global node in the slot info. Returns * OCFS2_INVALID_SLOT if nothing is found. 
*/ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, @@ -188,13 +221,22 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) { + unsigned int i; + if (si == NULL) return; if (si->si_inode) iput(si->si_inode); - if (si->si_bh) - brelse(si->si_bh); + if (si->si_bh) { + for (i = 0; i < si->si_blocks; i++) { + if (si->si_bh[i]) { + brelse(si->si_bh[i]); + si->si_bh[i] = NULL; + } + } + kfree(si->si_bh); + } kfree(si); } @@ -225,12 +267,65 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) return ocfs2_update_disk_slots(osb, osb->slot_info); } +static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status = 0; + u64 blkno; + unsigned long long blocks, bytes; + unsigned int i; + struct buffer_head *bh; + + status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); + if (status) + goto bail; + + blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); + BUG_ON(blocks > UINT_MAX); + si->si_blocks = blocks; + if (!si->si_blocks) + goto bail; + + mlog(0, "Slot map needs %u buffers for %llu bytes\n", + si->si_blocks, bytes); + + si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + GFP_KERNEL); + if (!si->si_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + for (i = 0; i < si->si_blocks; i++) { + status = ocfs2_extent_map_get_blocks(si->si_inode, i, + &blkno, NULL, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Reading slot map block %u at %llu\n", i, + (unsigned long long)blkno); + + bh = NULL; /* Acquire a fresh bh */ + status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh[i] = bh; + } + +bail: + return status; +} + int ocfs2_init_slot_info(struct ocfs2_super *osb) { int status, i; - u64 blkno; struct inode *inode = NULL; - struct buffer_head *bh = NULL; struct 
ocfs2_slot_info *si; si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); @@ -254,20 +349,13 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + si->si_inode = inode; + status = ocfs2_map_slot_buffers(osb, si); if (status < 0) { mlog_errno(status); goto bail; } - si->si_inode = inode; - si->si_bh = bh; osb->slot_info = (struct ocfs2_slot_info *)si; bail: if (status < 0 && si) -- 1.5.2.2
Joel Becker
2008-Jan-03 13:41 UTC
[Ocfs2-devel] [PATCH 2/7] ocfs2: Make ocfs2_slot_info private.
Just use osb_lock around the ocfs2_slot_info data. This allows us to make the ocfs2_slot_info structure private to slot_map.c. All access is now via accessors. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/journal.c | 24 +++++++------- fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/slot_map.c | 81 ++++++++++++++++++++++++++++++++++++--------------- fs/ocfs2/slot_map.h | 25 ++------------- 4 files changed, 74 insertions(+), 57 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index da63375..322b981 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1065,7 +1065,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, { int status = 0; int slot_num; - struct ocfs2_slot_info *si = osb->slot_info; struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; @@ -1078,8 +1077,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, * case we should've called ocfs2_journal_load instead. */ BUG_ON(osb->node_num == node_num); - slot_num = ocfs2_node_num_to_slot(si, node_num); - if (slot_num == OCFS2_INVALID_SLOT) { + slot_num = ocfs2_node_num_to_slot(osb, node_num); + if (slot_num == -ENOENT) { status = 0; mlog(0, "no slot for this node, so no recovery required.\n"); goto done; @@ -1169,23 +1168,24 @@ bail: * slot info struct has been updated from disk. */ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { - int status, i, node_num; - struct ocfs2_slot_info *si = osb->slot_info; + unsigned int node_num; + int status, i; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. 
*/ - spin_lock(&si->si_lock); - for(i = 0; i < si->si_num_slots; i++) { + spin_lock(&osb->osb_lock); + for(i = 0; i < osb->max_slots; i++) { if (i == osb->slot_num) continue; - if (ocfs2_is_empty_slot(si, i)) + + status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); + if (status == -ENOENT) continue; - node_num = si->si_global_node_nums[i]; if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) continue; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which * is not in the recovery map. We trylock his journal @@ -1201,9 +1201,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) goto bail; } - spin_lock(&si->si_lock); + spin_lock(&osb->osb_lock); } - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); status = 0; bail: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index f8f8661..67867b4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -178,6 +178,7 @@ enum ocfs2_mount_options #define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; +struct ocfs2_slot_info; struct ocfs2_super { struct task_struct *commit_task; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index f5727b8..762360d 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -42,13 +42,25 @@ #include "buffer_head_io.h" +struct ocfs2_slot_info { + struct inode *si_inode; + struct buffer_head *si_bh; + unsigned int si_num_slots; + unsigned int si_size; + s16 si_global_node_nums[OCFS2_MAX_SLOTS]; +}; + + static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, s16 global); static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, s16 slot_num, s16 node_num); -/* post the slot information on disk into our slot_info struct. */ +/* + * Post the slot information on disk into our slot_info struct. + * Must be protected by osb_lock. 
+ */ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) { int i; @@ -56,13 +68,10 @@ static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) /* we don't read the slot block here as ocfs2_super_lock * should've made sure we have the most recent copy. */ - spin_lock(&si->si_lock); disk_info = (__le16 *) si->si_bh->b_data; for (i = 0; i < si->si_size; i++) si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); - - spin_unlock(&si->si_lock); } int ocfs2_refresh_slot_info(struct ocfs2_super *osb) @@ -76,8 +85,11 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) bh = si->si_bh; ret = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, si->si_inode); - if (ret == 0) + if (ret == 0) { + spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); + spin_unlock(&osb->osb_lock); + } return ret; } @@ -90,10 +102,10 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, int status, i; __le16 *disk_info = (__le16 *) si->si_bh->b_data; - spin_lock(&si->si_lock); + spin_lock(&osb->osb_lock); for (i = 0; i < si->si_size; i++) disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); status = ocfs2_write_block(osb, si->si_bh, si->si_inode); if (status < 0) @@ -119,7 +131,8 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, return ret; } -static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, + s16 preferred) { int i; s16 ret = OCFS2_INVALID_SLOT; @@ -141,15 +154,36 @@ out: return ret; } -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global) +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) { - s16 ret; + s16 slot; + struct ocfs2_slot_info *si = osb->slot_info; - spin_lock(&si->si_lock); - ret = __ocfs2_node_num_to_slot(si, global); - spin_unlock(&si->si_lock); - return ret; + spin_lock(&osb->osb_lock); + slot = __ocfs2_node_num_to_slot(si, node_num); + 
spin_unlock(&osb->osb_lock); + + if (slot == OCFS2_INVALID_SLOT) + return -ENOENT; + + return slot; +} + +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num) +{ + struct ocfs2_slot_info *si = osb->slot_info; + + assert_spin_locked(&osb->osb_lock); + + BUG_ON(slot_num < 0); + BUG_ON(slot_num > osb->max_slots); + + if (si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT) + return -ENOENT; + + *node_num = si->si_global_node_nums[slot_num]; + return 0; } static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) @@ -184,9 +218,9 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num) if (si == NULL) return 0; - spin_lock(&si->si_lock); + spin_lock(&osb->osb_lock); __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); return ocfs2_update_disk_slots(osb, osb->slot_info); } @@ -206,7 +240,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - spin_lock_init(&si->si_lock); si->si_num_slots = osb->max_slots; si->si_size = OCFS2_MAX_SLOTS; @@ -235,7 +268,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_inode = inode; si->si_bh = bh; - osb->slot_info = si; + osb->slot_info = (struct ocfs2_slot_info *)si; bail: if (status < 0 && si) __ocfs2_free_slot_info(si); @@ -261,9 +294,9 @@ int ocfs2_find_slot(struct ocfs2_super *osb) si = osb->slot_info; + spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - spin_lock(&si->si_lock); /* search for ourselves first and take the slot if it already * exists. Perhaps we need to mark this in a variable for our * own journal recovery? Possibly not, though we certainly @@ -274,7 +307,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) * one. 
*/ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); if (slot == OCFS2_INVALID_SLOT) { - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); mlog(ML_ERROR, "no free slots available!\n"); status = -EINVAL; goto bail; @@ -285,7 +318,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb) __ocfs2_fill_slot(si, slot, osb->node_num); osb->slot_num = slot; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); mlog(0, "taking node slot %d\n", osb->slot_num); @@ -306,12 +339,12 @@ void ocfs2_put_slot(struct ocfs2_super *osb) if (!si) return; + spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - spin_lock(&si->si_lock); __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); osb->slot_num = OCFS2_INVALID_SLOT; - spin_unlock(&si->si_lock); + spin_unlock(&osb->osb_lock); status = ocfs2_update_disk_slots(osb, si); if (status < 0) { diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index b029ffd..5118e89 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -27,16 +27,6 @@ #ifndef SLOTMAP_H #define SLOTMAP_H -struct ocfs2_slot_info { - spinlock_t si_lock; - - struct inode *si_inode; - struct buffer_head *si_bh; - unsigned int si_num_slots; - unsigned int si_size; - s16 si_global_node_nums[OCFS2_MAX_SLOTS]; -}; - int ocfs2_init_slot_info(struct ocfs2_super *osb); void ocfs2_free_slot_info(struct ocfs2_super *osb); @@ -45,17 +35,10 @@ void ocfs2_put_slot(struct ocfs2_super *osb); int ocfs2_refresh_slot_info(struct ocfs2_super *osb); -s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, - s16 global); -int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); +int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); +int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, + unsigned int *node_num); -static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, - int slot_num) -{ - BUG_ON(slot_num == OCFS2_INVALID_SLOT); - assert_spin_locked(&si->si_lock); - - return si->si_global_node_nums[slot_num] == 
OCFS2_INVALID_SLOT; -} +int ocfs2_clear_slot(struct ocfs2_super *osb, s16 slot_num); #endif -- 1.5.2.2
Joel Becker
2008-Jan-03 13:41 UTC
[Ocfs2-devel] [PATCH 6/7] ocfs2: Define the contents of the slot_map file.
The slot map file is merely an array of __le16. Wrap it in a structure for cleaner reference. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/ocfs2_fs.h | 12 ++++++++++++ fs/ocfs2/slot_map.c | 15 ++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 6ef8767..26a5565 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -453,6 +453,18 @@ struct ocfs2_extent_block }; /* + * On disk slot map for OCFS2. This defines the contents of the "slot_map" + * system file. + */ +struct ocfs2_slot_map { +/*00*/ __le16 sm_slots[0]; +/* + * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, + * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. + */ +}; + +/* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 3af880f..12550fe 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -85,17 +85,17 @@ static void ocfs2_set_slot(struct ocfs2_slot_info *si, static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) { int i; - __le16 *disk_info; + struct ocfs2_slot_map *sm; /* we don't read the slot block here as ocfs2_super_lock * should've made sure we have the most recent copy. 
*/ - disk_info = (__le16 *) si->si_bh[0]->b_data; + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { - if (le16_to_cpu(disk_info[i]) == (u16)OCFS2_INVALID_SLOT) + if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) ocfs2_invalidate_slot(si, i); else - ocfs2_set_slot(si, i, le16_to_cpu(disk_info[i])); + ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); } } @@ -135,15 +135,16 @@ static int ocfs2_update_disk_slots(struct ocfs2_super *osb, struct ocfs2_slot_info *si) { int status, i; - __le16 *disk_info = (__le16 *) si->si_bh[0]->b_data; + struct ocfs2_slot_map *sm; spin_lock(&osb->osb_lock); + sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; for (i = 0; i < si->si_num_slots; i++) { if (si->si_slots[i].sl_valid) - disk_info[i] + sm->sm_slots[i] cpu_to_le16(si->si_slots[i].sl_node_num); else - disk_info[i] = cpu_to_le16(OCFS2_INVALID_SLOT); + sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); } spin_unlock(&osb->osb_lock); -- 1.5.2.2
ocfs2 has a system file called "slot_map". A "slot" is a collection of files local to a particular mounted node, including the journal and allocators that node is using. The slot map converts the slot number to a node number, so when a node dies, ocfs2 knows which slot to recover. The old ocfs2 slot map is very limited. It has a physical maximum of 254 entries - specifically, it must fit within one disk block. It only allows node numbers up to 254, and cannot be extended past INT16_MAX (32767). This is a problem in the world of userspace cluster stacks, where the node numbers are often sparse and can be up to UINT32_MAX. It also has the structural problem that empty slots are signified by a magic number. That number happens to be -1 (0xFFFF). It makes for code that isn't as obvious as one would like. Thus, we introduce a new slot map format, referred to henceforth as the "extended slot map". The extended slot map is allocated as regular file space, and so is bound by i_size. The new format adds a "valid" field, distinct from the node number. Finally, it has room for extension should it be needed. The kernel patches follow this email. These patches rely on a couple of cleanups that are in the 'cluster_abstractions' branch of ocfs2.git. The cleanup patches are not part of the email thread, but are part of my git repository. The kernel code is available on the 'new-slot-map' branch of my git repository. View: http://oss.oracle.com/git/?p=jlbec/linux-2.6.git;a=shortlog;h=new-slot-map Pull: git pull git://oss.oracle.com/git/jlbec/linux-2.6.git new-slot-map The tools code is also available via git, in the 'new-slot-map' branch as well. View: http://oss.oracle.com/git/?p=ocfs2-tools.git;a=shortlog;h=new-slot-map Pull: git pull git://oss.oracle.com/git/ocfs2-tools.git new-slot-map
Joel Becker
2008-Jan-03 13:41 UTC
[Ocfs2-devel] [PATCH 3/7] ocfs2: Change the recovery map to an array of node numbers.
The old recovery map was a bitmap of node numbers. This was sufficient for the maximum node number of 254. Going forward, we want node numbers to be UINT32. Thus, we need a new recovery map. Note that we can't keep track of slots here. We must write down the node number to recover *before* we get the locks needed to convert a node number into a slot number. The recovery map is now an array of unsigned ints, max_slots in size. It moves to journal.c with the rest of recovery. Because it needs to be initialized, we move all of recovery initialization into a new function, ocfs2_recovery_init(). This actually cleans up ocfs2_initialize_super() a little as well. Following on, recovery cleanup becomes part of ocfs2_recovery_exit(). A number of node map functions are rendered obsolete and are removed. Finally, waiting on recovery is wrapped in a function rather than naked checks on the recovery_event. This is a cleanup from Mark. Signed-off-by: Joel Becker <joel.becker@oracle.com> --- fs/ocfs2/dlmglue.c | 6 +- fs/ocfs2/heartbeat.c | 111 ------------------------------ fs/ocfs2/heartbeat.h | 18 ----- fs/ocfs2/journal.c | 182 +++++++++++++++++++++++++++++++++++++++++++++---- fs/ocfs2/journal.h | 4 + fs/ocfs2/ocfs2.h | 3 +- fs/ocfs2/super.c | 33 ++------- 7 files changed, 183 insertions(+), 174 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c2ebd72..828f1dd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1753,8 +1753,7 @@ int ocfs2_meta_lock_full(struct inode *inode, goto local; if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); lockres = &OCFS2_I(inode)->ip_meta_lockres; level = ex ? LKM_EXMODE : LKM_PRMODE; @@ -1777,8 +1776,7 @@ int ocfs2_meta_lock_full(struct inode *inode, * committed to owning this lock so we don't allow signals to * abort the operation. 
*/ if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) - wait_event(osb->recovery_event, - ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + ocfs2_wait_for_recovery(osb); local: /* diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index c0efd94..8e3eac8 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -48,16 +48,10 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, int bit); static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, int bit); -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); -static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, - struct ocfs2_node_map *from); -static void __ocfs2_node_map_set(struct ocfs2_node_map *target, - struct ocfs2_node_map *from); void ocfs2_init_node_maps(struct ocfs2_super *osb) { spin_lock_init(&osb->node_map_lock); - ocfs2_node_map_init(&osb->recovery_map); ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } @@ -196,108 +190,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb, return ret; } -static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) -{ - int bit; - bit = find_next_bit(map->map, map->num_nodes, 0); - if (bit < map->num_nodes) - return 0; - return 1; -} - -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - int ret; - BUG_ON(map->num_nodes == 0); - spin_lock(&osb->node_map_lock); - ret = __ocfs2_node_map_is_empty(map); - spin_unlock(&osb->node_map_lock); - return ret; -} - -static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - BUG_ON(from->num_nodes == 0); - ocfs2_node_map_init(target); - __ocfs2_node_map_set(target, from); -} - -/* returns 1 if bit is the only bit set in target, 0 otherwise */ -int ocfs2_node_map_is_only(struct ocfs2_super *osb, - struct ocfs2_node_map *target, - int bit) -{ - struct ocfs2_node_map temp; - int ret; - - spin_lock(&osb->node_map_lock); - __ocfs2_node_map_dup(&temp, target); - 
__ocfs2_node_map_clear_bit(&temp, bit); - ret = __ocfs2_node_map_is_empty(&temp); - spin_unlock(&osb->node_map_lock); - - return ret; -} - -static void __ocfs2_node_map_set(struct ocfs2_node_map *target, - struct ocfs2_node_map *from) -{ - int num_longs, i; - - BUG_ON(target->num_nodes != from->num_nodes); - BUG_ON(target->num_nodes == 0); - - num_longs = BITS_TO_LONGS(target->num_nodes); - for (i = 0; i < num_longs; i++) - target->map[i] = from->map[i]; -} - -/* Returns whether the recovery bit was actually set - it may not be - * if a node is still marked as needing recovery */ -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num) -{ - int set = 0; - - spin_lock(&osb->node_map_lock); - - if (!test_bit(num, osb->recovery_map.map)) { - __ocfs2_node_map_set_bit(&osb->recovery_map, num); - set = 1; - } - - spin_unlock(&osb->node_map_lock); - - return set; -} - -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num) -{ - ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); -} - -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx) -{ - int i = idx; - - idx = O2NM_INVALID_NODE_NUM; - spin_lock(&osb->node_map_lock); - if ((i != O2NM_INVALID_NODE_NUM) && - (i >= 0) && - (i < map->num_nodes)) { - while(i < map->num_nodes) { - if (test_bit(i, map->map)) { - idx = i; - break; - } - i++; - } - } - spin_unlock(&osb->node_map_lock); - return idx; -} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index 5685921..2d65f1c 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -34,8 +34,6 @@ void ocfs2_stop_heartbeat(struct ocfs2_super *osb); /* node map functions - used to keep track of mounted and in-recovery * nodes. 
*/ void ocfs2_node_map_init(struct ocfs2_node_map *map); -int ocfs2_node_map_is_empty(struct ocfs2_super *osb, - struct ocfs2_node_map *map); void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); @@ -45,21 +43,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, int ocfs2_node_map_test_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit); -int ocfs2_node_map_iterate(struct ocfs2_super *osb, - struct ocfs2_node_map *map, - int idx); -static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, - struct ocfs2_node_map *map) -{ - return ocfs2_node_map_iterate(osb, map, 0); -} -int ocfs2_recovery_map_set(struct ocfs2_super *osb, - int num); -void ocfs2_recovery_map_clear(struct ocfs2_super *osb, - int num); -/* returns 1 if bit is the only bit set in target, 0 otherwise */ -int ocfs2_node_map_is_only(struct ocfs2_super *osb, - struct ocfs2_node_map *target, - int bit); #endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 322b981..d512a5c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -64,6 +64,138 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); + +/* + * The recovery_list is a simple linked list of node numbers to recover. + * It is protected by the recovery_lock. 
+ */ + +struct ocfs2_recovery_map { + int rm_used; + unsigned int *rm_entries; +}; + +int ocfs2_recovery_init(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + mutex_init(&osb->recovery_lock); + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + init_waitqueue_head(&osb->recovery_event); + + rm = kzalloc(sizeof(struct ocfs2_recovery_map) + + osb->max_slots * sizeof(unsigned int), + GFP_KERNEL); + if (!rm) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + rm->rm_entries = (unsigned int *)((char *)rm + sizeof(struct ocfs2_recovery_map)); + osb->recovery_map = rm; + + return 0; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + mutex_lock(&osb->recovery_lock); + osb->disable_recovery = 1; + mutex_unlock(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. */ + flush_workqueue(ocfs2_wq); + + /* + * Now that recovery is shut down, and the osb is about to be + * freed, the osb_lock is not taken here. + */ + rm = osb->recovery_map; + /* XXX: Should we bug if there are dirty entries? */ + + if (rm) + kfree(rm); +} + +/* Behaves like test-and-set. 
Returns the previous value */ +static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + assert_spin_locked(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) { + return 1; + } + } + + return 0; +} + +static int ocfs2_recovery_map_set(struct ocfs2_super *osb, + unsigned int node_num) +{ + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); + return 1; + } + + /* XXX: Can this be exploited? Not from o2dlm... */ + BUG_ON(rm->rm_used >= osb->max_slots); + + rm->rm_entries[rm->rm_used] = node_num; + rm->rm_used++; + spin_unlock(&osb->osb_lock); + + return 0; +} + +static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + unsigned int node_num) +{ + int i; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + + for (i = 0; i < rm->rm_used; i++) { + if (rm->rm_entries[i] == node_num) + break; + } + + if (i < rm->rm_used) { + /* XXX: be careful with the pointer math */ + memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), + (rm->rm_used - i - 1) * sizeof(unsigned int)); + rm->rm_used--; + } + + spin_unlock(&osb->osb_lock); +} + static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; @@ -636,6 +768,23 @@ bail: return status; } +static int ocfs2_recovery_completed(struct ocfs2_super *osb) +{ + int empty; + struct ocfs2_recovery_map *rm = osb->recovery_map; + + spin_lock(&osb->osb_lock); + empty = (rm->rm_used == 0); + spin_unlock(&osb->osb_lock); + + return empty; +} + +void ocfs2_wait_for_recovery(struct ocfs2_super *osb) +{ + wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); +} + /* * JBD Might read a cached version of another nodes journal file. 
We * don't want this as this file changes often and we get no @@ -834,6 +983,7 @@ static int __ocfs2_recovery_thread(void *arg) { int status, node_num; struct ocfs2_super *osb = arg; + struct ocfs2_recovery_map *rm = osb->recovery_map; mlog_entry_void(); @@ -849,26 +999,29 @@ restart: goto bail; } - while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - node_num = ocfs2_node_map_first_set_bit(osb, - &osb->recovery_map); - if (node_num == O2NM_INVALID_NODE_NUM) { - mlog(0, "Out of nodes to recover.\n"); - break; - } + spin_lock(&osb->osb_lock); + while (rm->rm_used) { + /* It's always safe to remove entry zero, as we won't + * clear it until ocfs2_recover_node() has succeeded. */ + node_num = rm->rm_entries[0]; + spin_unlock(&osb->osb_lock); status = ocfs2_recover_node(osb, node_num); - if (status < 0) { + if (!status) { + ocfs2_recovery_map_clear(osb, node_num); + } else { mlog(ML_ERROR, "Error %d recovering node %d on device (%u,%u)!\n", status, node_num, MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); mlog(ML_ERROR, "Volume requires unmount.\n"); - continue; } - ocfs2_recovery_map_clear(osb, node_num); + spin_lock(&osb->osb_lock); } + spin_unlock(&osb->osb_lock); + mlog(0, "All nodes recovered\n"); + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead @@ -879,8 +1032,7 @@ restart: bail: mutex_lock(&osb->recovery_lock); - if (!status && - !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + if (!status && !ocfs2_recovery_completed(osb)) { mutex_unlock(&osb->recovery_lock); goto restart; } @@ -910,8 +1062,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) /* People waiting on recovery will wait on * the recovery map to empty. 
*/ - if (!ocfs2_recovery_map_set(osb, node_num)) - mlog(0, "node %d already be in recovery.\n", node_num); + if (ocfs2_recovery_map_set(osb, node_num)) + mlog(0, "node %d already in recovery map.\n", node_num); mlog(0, "starting recovery thread...\n"); @@ -1183,7 +1335,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) if (status == -ENOENT) continue; - if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) continue; spin_unlock(&osb->osb_lock); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4b32e09..11008a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_complete_recovery(struct work_struct *work); +void ocfs2_wait_for_recovery(struct ocfs2_super *osb); + +int ocfs2_recovery_init(struct ocfs2_super *osb); +void ocfs2_recovery_exit(struct ocfs2_super *osb); /* * Journal Control: diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 67867b4..96674f7 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -179,6 +179,7 @@ enum ocfs2_mount_options struct ocfs2_journal; struct ocfs2_slot_info; +struct ocfs2_recovery_map; struct ocfs2_super { struct task_struct *commit_task; @@ -190,7 +191,6 @@ struct ocfs2_super struct ocfs2_slot_info *slot_info; spinlock_t node_map_lock; - struct ocfs2_node_map recovery_map; u64 root_blkno; u64 system_dir_blkno; @@ -225,6 +225,7 @@ struct ocfs2_super atomic_t vol_state; struct mutex recovery_lock; + struct ocfs2_recovery_map *recovery_map; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 012b555..2142985 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1166,15 +1166,6 @@ leave: return status; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * 
memory barriers to make sure that we'll see the null task before - * being woken up */ -static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) -{ - mb(); - return osb->recovery_thread_task != NULL; -} - static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { int tmp; @@ -1191,17 +1182,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_truncate_log_shutdown(osb); - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ - mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; - mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ - flush_workqueue(ocfs2_wq); + /* This will disable recovery and flush any recovery work. */ + ocfs2_recovery_exit(osb); ocfs2_journal_shutdown(osb); @@ -1310,7 +1292,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); - init_waitqueue_head(&osb->recovery_event); spin_lock_init(&osb->dc_task_lock); init_waitqueue_head(&osb->dc_event); osb->dc_work_sequence = 0; @@ -1330,10 +1311,12 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - mutex_init(&osb->recovery_lock); - - osb->disable_recovery = 0; - osb->recovery_thread_task = NULL; + status = ocfs2_recovery_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize recovery state\n"); + mlog_errno(status); + goto bail; + } init_waitqueue_head(&osb->checkpoint_event); atomic_set(&osb->needs_checkpoint, 0); -- 1.5.2.2