The following patches apply some lessons I've learned during some testing of a couple non-trivial workloads on Ocfs2. Roughly speaking, they tune "up" the default local alloc windows, and tune "down" the reservations code a bit. I primarily ran two tests - one simulating a large multi-node data-processing workload, and the multi-threaded writers test, mtwrite at: http://www.kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/tests/mtwrite.c All patches were developed and tested on top of the current set of allocation changes in ocfs2.git (primarily reservations, and the local alloc fixes we have). A git branch containing all the patches is available, based off 2.6.33: git pull git://git.kernel.org/pub/scm/linux/kernel/git/ocfs2-mark.git disk-alloc Basically though, this all involved many many test runs (of which I have lots of data) and a thorough checking of fragmentation levels. I changed only one value at a time. Details regarding some of the test runs are within the specific patch descriptions. --Mark -- Mark Fasheh
Mark Fasheh
2010-Apr-06 01:17 UTC
[Ocfs2-devel] [PATCH 1/4] ocfs2: clean up localalloc mount option size parsing
This patch pulls the local alloc sizing code into localalloc.c and provides
a callout to it from ocfs2_fill_super(). Behavior is essentially unchanged
except that I correctly calculate the maximum local alloc size. The old code
in ocfs2_parse_options() calculated the max size as:
ocfs2_local_alloc_size(sb) * 8
which is correct, in bits. Unfortunately though the option passed in is in
megabytes. Ultimately, this bug made no real difference - the shrink code
would catch a too-large size and bring it down to something reasonable.
Still, it's less than efficient as-is.
Signed-off-by: Mark Fasheh <mfasheh at suse.com>
---
fs/ocfs2/localalloc.c | 28 ++++++++++++++++++++++++++++
fs/ocfs2/localalloc.h | 2 ++
fs/ocfs2/ocfs2.h | 6 ++++++
fs/ocfs2/super.c | 10 +++++-----
4 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 18fc176..9dc34a6 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,6 +75,34 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super
*osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
+/*
+ * ocfs2_la_set_sizes() - pick the local alloc window size for this mount.
+ *
+ * @requested_mb is the localalloc= mount option in megabytes, or -1
+ * when the user did not pass one. A request above the maximum
+ * computed from ocfs2_local_alloc_size() is clamped to that maximum.
+ * The chosen size is stored, converted to clusters, in both
+ * osb->local_alloc_default_bits and osb->local_alloc_bits.
+ */
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
+{
+	struct super_block *sb = osb->sb;
+	unsigned int la_default_mb = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+	unsigned int la_max_mb;
+
+	/* ocfs2_local_alloc_size() * 8 is the maximum size in bits. */
+	la_max_mb = ocfs2_clusters_to_megabytes(sb,
+						ocfs2_local_alloc_size(sb) * 8);
+
+	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
+	     requested_mb, la_max_mb, la_default_mb);
+
+	if (requested_mb == -1) {
+		/* No user request - use defaults */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_default_mb);
+	} else if ((unsigned int)requested_mb > la_max_mb) {
+		/*
+		 * Request is too big, we give the maximum available.
+		 * The cast only spells out the int -> unsigned
+		 * conversion the comparison performs anyway.
+		 */
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, la_max_mb);
+	} else {
+		osb->local_alloc_default_bits =
+			ocfs2_megabytes_to_clusters(sb, requested_mb);
+	}
+
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
+}
+
static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index ac5ea9f..04195c6 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -30,6 +30,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
+void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
struct ocfs2_dinode **alloc_copy);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 4a6f00e..44b95a9 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -765,6 +765,12 @@ static inline unsigned int
ocfs2_megabytes_to_clusters(struct super_block *sb,
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
+/*
+ * Convert a cluster count to whole megabytes (the remainder is
+ * dropped by the shift). Inverse of ocfs2_megabytes_to_clusters().
+ */
+static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
+						       unsigned int clusters)
+{
+	/* 20 is log2 of one megabyte in bytes. */
+	const int mb_shift = 20 - OCFS2_SB(sb)->s_clustersize_bits;
+
+	return clusters >> mb_shift;
+}
+
static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index db354d1..6dfcb4e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -93,7 +93,7 @@ struct mount_options
unsigned long mount_opt;
unsigned int atime_quantum;
signed short slot;
- unsigned int localalloc_opt;
+ int localalloc_opt;
unsigned int resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
@@ -1027,8 +1027,8 @@ static int ocfs2_fill_super(struct super_block *sb, void
*data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb,
parsed_options.localalloc_opt);
- osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+ ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
osb->osb_resv_level = parsed_options.resv_level;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
@@ -1288,7 +1288,7 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt = 0;
mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
mopt->slot = OCFS2_INVALID_SLOT;
- mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ mopt->localalloc_opt = -1;
mopt->cluster_stack[0] = '\0';
mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
@@ -1381,7 +1381,7 @@ static int ocfs2_parse_options(struct super_block *sb,
status = 0;
goto bail;
}
- if (option >= 0 && (option <= ocfs2_local_alloc_size(sb) * 8))
+ if (option >= 0)
mopt->localalloc_opt = option;
break;
case Opt_localflocks:
--
1.6.4.2
Mark Fasheh
2010-Apr-06 01:17 UTC
[Ocfs2-devel] [PATCH 2/4] ocfs2: increase the default size of local alloc windows
I have observed that the current size of 8M gives us pretty poor
fragmentation on multi-threaded workloads which do lots of writes.
Generally, I can increase the size of local alloc windows and observe a
marked decrease in fragmentation, even up and beyond window sizes of 512
megabytes. This makes sense for a couple reasons - larger local alloc means
more room for reservation windows. On multi-node workloads the larger local
alloc helps as well because we don't have to do window slides as often.
Also, I removed the OCFS2_DEFAULT_LOCAL_ALLOC_SIZE constant as it is no
longer used and the comment above it was out of date.
To test fragmentation, I used a workload which launched 4 threads that did
4k writes into a series of about 140 alternating files.
With resv_level=2, and a 4k/4k file system I observed the following average
fragmentation for various localalloc= parameters:
localalloc= avg. fragmentation
8 48
32 16
64 10
120 7
On larger cluster sizes, the difference is more dramatic.
The new default size tops out at 256M, which we'll only get for cluster
sizes of 32K and above.
Signed-off-by: Mark Fasheh <mfasheh at suse.com>
---
fs/ocfs2/localalloc.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ocfs2/localalloc.h | 1 +
fs/ocfs2/ocfs2.h | 3 +
fs/ocfs2/ocfs2_fs.h | 8 ---
fs/ocfs2/super.c | 3 +-
5 files changed, 118 insertions(+), 11 deletions(-)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 9dc34a6..9b70a02 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -75,10 +75,120 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super
*osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
+/*
+ * ocfs2_la_default_mb() - determine a default size, in megabytes of
+ * the local alloc.
+ *
+ * Generally, we'd like to pick as large a local alloc as
+ * possible. Performance on large workloads tends to scale
+ * proportionally to la size. In addition to that, the reservations
+ * code functions more efficiently as it can reserve more windows for
+ * write.
+ *
+ * Some things work against us when trying to choose a large local alloc:
+ *
+ * - We need to ensure our sizing is picked to leave enough space in
+ * group descriptors for other allocations (such as block groups,
+ * etc). Picking default sizes which are a multiple of 4 could help
+ * - block groups are allocated in 2mb and 4mb chunks.
+ *
+ * - Likewise, we don't want to starve other nodes of bits on small
+ * file systems. This can easily be taken care of by limiting our
+ * default to a reasonable size (256M) on larger cluster sizes.
+ *
+ * - Some file systems can't support very large sizes - 4k and 8k in
+ * particular are limited to less than 128 and 256 megabytes respectively.
+ *
+ * The following reference table shows group descriptor and local
+ * alloc maximums at various cluster sizes (4k blocksize)
+ *
+ * csize: 4K group: 126M la: 121M
+ * csize: 8K group: 252M la: 243M
+ * csize: 16K group: 504M la: 486M
+ * csize: 32K group: 1008M la: 972M
+ * csize: 64K group: 2016M la: 1944M
+ * csize: 128K group: 4032M la: 3888M
+ * csize: 256K group: 8064M la: 7776M
+ * csize: 512K group: 16128M la: 15552M
+ * csize: 1024K group: 32256M la: 31104M
+ */
+#define OCFS2_LA_MAX_DEFAULT_MB 256
+#define OCFS2_LA_OLD_DEFAULT 8
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
+{
+	unsigned int la_mb;
+	unsigned int gd_mb;
+	unsigned int clusters_per_slot;
+	unsigned int megs_per_slot;
+	struct super_block *sb = osb->sb;
+
+	/* Size, in megabytes, covered by one group descriptor bitmap. */
+	gd_mb = ocfs2_clusters_to_megabytes(osb->sb,
+					    8 * ocfs2_group_bitmap_size(sb));
+
+	/*
+	 * This takes care of file systems with very small group
+	 * descriptors - 512 byte blocksize at cluster sizes lower
+	 * than 16K and also 1k blocksize with 4k cluster size.
+	 */
+	if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
+	    || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096))
+		return OCFS2_LA_OLD_DEFAULT;
+
+	/*
+	 * Leave enough room for some block groups and make the final
+	 * value we work from a multiple of 4.
+	 */
+	gd_mb -= 16;
+	/* Clear the two low bits: round down to a multiple of 4. */
+	gd_mb &= 0xFFFFFFFC;
+
+	la_mb = gd_mb;
+
+	/*
+	 * Keep window sizes down to a reasonable default
+	 */
+	if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) {
+		/*
+		 * Some clustersize / blocksize combinations will have
+		 * given us a larger than OCFS2_LA_MAX_DEFAULT_MB
+		 * default size, but get poor distribution when
+		 * limited to exactly 256 megabytes.
+		 *
+		 * As an example, 16K clustersize at 4K blocksize
+		 * gives us a cluster group size of 504M. Paring the
+		 * local alloc size down to 256 however, would give us
+		 * only one window and around 200MB left in the
+		 * cluster group. Instead, find the first size below
+		 * 256 which would give us an even distribution.
+		 *
+		 * Larger cluster group sizes actually work out pretty
+		 * well when pared to 256, so we don't have to do this
+		 * for any group that fits more than two
+		 * OCFS2_LA_MAX_DEFAULT_MB windows.
+		 */
+		if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
+			la_mb = OCFS2_LA_MAX_DEFAULT_MB;
+		else {
+			unsigned int gd_mult = gd_mb;
+
+			/* Halve until the window fits under the cap. */
+			while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB)
+				gd_mult = gd_mult >> 1;
+
+			la_mb = gd_mult;
+		}
+	}
+
+	/*
+	 * Never default to more than an even per-slot share of the
+	 * clusters the global bitmap had at mount time.
+	 */
+	clusters_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
+	megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, clusters_per_slot);
+	/* Too many nodes, too few disk clusters. */
+	if (megs_per_slot < la_mb)
+		la_mb = megs_per_slot;
+
+	return la_mb;
+}
+
void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
{
struct super_block *sb = osb->sb;
- unsigned int la_default_mb = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ unsigned int la_default_mb = ocfs2_la_default_mb(osb);
unsigned int la_max_mb;
la_max_mb = ocfs2_clusters_to_megabytes(sb,
@@ -185,7 +295,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits = ocfs2_megabytes_to_clusters(osb->sb,
- OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
+ ocfs2_la_default_mb(osb));
}
/* read the alloc off disk */
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 04195c6..1be9b58 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -31,6 +31,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb);
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb);
+unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb);
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
int node_num,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 44b95a9..72e6eef 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -339,6 +339,9 @@ struct ocfs2_super
*/
unsigned int local_alloc_bits;
unsigned int local_alloc_default_bits;
+ /* osb_clusters_at_boot can become stale! Do not trust it to
+ * be up to date. */
+ unsigned int osb_clusters_at_boot;
enum ocfs2_local_alloc_state local_alloc_state; /* protected
* by osb_lock */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7638a38..d58bc76 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -340,14 +340,6 @@ struct reflink_arguments {
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
/*
- * Default local alloc size (in megabytes)
- *
- * The value chosen should be such that most allocations, including new
- * block groups, use local alloc.
- */
-#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
-
-/*
* Inline extended attribute size (in bytes)
* The value chosen should be aligned to 16 byte boundaries.
*/
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6dfcb4e..bca1ed3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1499,7 +1499,7 @@ static int ocfs2_show_options(struct seq_file *s, struct
vfsmount *mnt)
(unsigned) (osb->osb_commit_interval / HZ));
local_alloc_megs = osb->local_alloc_bits >> (20 -
osb->s_clustersize_bits);
- if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ if (local_alloc_megs != ocfs2_la_default_mb(osb))
seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
@@ -2247,6 +2247,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
+ osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters;
iput(inode);
osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8;
--
1.6.4.2
Mark Fasheh
2010-Apr-06 01:17 UTC
[Ocfs2-devel] [PATCH 3/4] ocfs2: change default reservation window sizes
The default reservation size of 4 (32-bit windows) is a bit too ambitious.
Scale it back to 16 bits (resv_level=2). I have been testing various sizes
on a 4-node cluster which runs a mixed workload that is heavily threaded.
With a 256MB local alloc, I get *roughly* the following levels of average file
fragmentation:
resv_level=0 70%
resv_level=1 21%
resv_level=2 23%
resv_level=3 24%
resv_level=4 60%
resv_level=5 did not test
resv_level=6 60%
resv_level=2 seemed like a good compromise between not letting windows be
too small, but not so big that heavier workloads will immediately suffer
without tuning.
This patch also changes the behavior of directory reservations - they now
track file reservations. The previous compromise of giving directory
windows only 8 bits wound up fragmenting more at some window sizes because
file allocations had smaller unused windows to poach from.
Signed-off-by: Mark Fasheh <mfasheh at suse.com>
---
Documentation/filesystems/ocfs2.txt | 2 +-
fs/ocfs2/reservations.c | 7 ++++---
fs/ocfs2/reservations.h | 2 +-
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/Documentation/filesystems/ocfs2.txt
b/Documentation/filesystems/ocfs2.txt
index 412df90..32339e5 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -80,6 +80,6 @@ user_xattr (*) Enables Extended User Attributes.
nouser_xattr Disables Extended User Attributes.
acl Enables POSIX Access Control Lists support.
noacl (*) Disables POSIX Access Control Lists support.
-resv_level=4 (*) Set how agressive allocation reservations will be.
+resv_level=2 (*) Set how agressive allocation reservations will be.
Valid values are between 0 (reservations off) to 8
(maximum space for reservations).
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 7fc6cfe..87fa357 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -55,9 +55,10 @@ static unsigned int ocfs2_resv_window_bits(struct
ocfs2_reservation_map *resmap,
if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) {
/* 8, 16, 32, 64, 128, 256, 512, 1024 */
bits = 4 << osb->osb_resv_level;
- } else
- bits = OCFS2_RESV_DIR_WINDOW_BITS;
-
+ } else {
+ /* For now, treat directories the same as files. */
+ bits = 4 << osb->osb_resv_level;
+ }
return bits;
}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 34bb308..022aff6 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -22,7 +22,7 @@
#include <linux/rbtree.h>
-#define OCFS2_DEFAULT_RESV_LEVEL 4
+#define OCFS2_DEFAULT_RESV_LEVEL 2
#define OCFS2_MAX_RESV_LEVEL 9
#define OCFS2_MIN_RESV_LEVEL 0
--
1.6.4.2
Mark Fasheh
2010-Apr-06 01:17 UTC
[Ocfs2-devel] [PATCH 4/4] ocfs2: Add dir_resv_level mount option
The default behavior for directory reservations stays the same, but we add a
mount option so people can tweak the size of directory reservations
according to their workloads.
Signed-off-by: Mark Fasheh <mfasheh at suse.com>
---
Documentation/filesystems/ocfs2.txt | 4 ++++
fs/ocfs2/dir.c | 6 ++++--
fs/ocfs2/ocfs2.h | 1 +
fs/ocfs2/reservations.c | 9 ++++++---
fs/ocfs2/reservations.h | 2 ++
fs/ocfs2/super.c | 23 +++++++++++++++++++++++
6 files changed, 40 insertions(+), 5 deletions(-)
diff --git a/Documentation/filesystems/ocfs2.txt
b/Documentation/filesystems/ocfs2.txt
index 32339e5..1f7ae14 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -83,3 +83,7 @@ noacl (*) Disables POSIX Access Control Lists support.
resv_level=2 (*) Set how agressive allocation reservations will be.
Valid values are between 0 (reservations off) to 8
(maximum space for reservations).
+dir_resv_level= (*) By default, directory reservations will scale with file
+ reservations - users should rarely need to change this
+ value. If allocation reservations are turned off, this
+ option will have no effect.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b71acf3..dce39da 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2993,7 +2993,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir,
struct buffer_head *di_bh,
* if we only get one now, that's enough to continue. The rest
* will be claimed after the conversion to extents.
*/
- data_ac->ac_resv = &oi->ip_la_data_resv;
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &oi->ip_la_data_resv;
ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
@@ -3373,7 +3374,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
+ if (ocfs2_dir_resv_allowed(osb))
+ data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
credits = ocfs2_calc_extend_credits(sb, el, 1);
} else {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 72e6eef..e946936 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -353,6 +353,7 @@ struct ocfs2_super
struct ocfs2_reservation_map osb_la_resmap;
unsigned int osb_resv_level;
+ unsigned int osb_dir_resv_level;
/* Next three fields are for local node slot recovery during
* mount. */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 87fa357..6497bcc 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -44,7 +44,11 @@ DEFINE_SPINLOCK(resv_lock);
#define OCFS2_MIN_RESV_WINDOW_BITS 8
#define OCFS2_MAX_RESV_WINDOW_BITS 1024
-#define OCFS2_RESV_DIR_WINDOW_BITS OCFS2_MIN_RESV_WINDOW_BITS
+
+/*
+ * Returns 1 when both the general reservation level and the
+ * directory reservation level are non-zero, 0 otherwise.
+ */
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb)
+{
+	if (!osb->osb_resv_level)
+		return 0;
+
+	return osb->osb_dir_resv_level != 0;
+}
static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map
*resmap,
struct ocfs2_alloc_reservation *resv)
@@ -56,8 +60,7 @@ static unsigned int ocfs2_resv_window_bits(struct
ocfs2_reservation_map *resmap,
/* 8, 16, 32, 64, 128, 256, 512, 1024 */
bits = 4 << osb->osb_resv_level;
} else {
- /* For now, treat directories the same as files. */
- bits = 4 << osb->osb_resv_level;
+ bits = 4 << osb->osb_dir_resv_level;
}
return bits;
}
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 022aff6..25b0c0e 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -67,6 +67,8 @@ void ocfs2_resv_init_once(struct ocfs2_alloc_reservation
*resv);
void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv,
unsigned int flags);
+int ocfs2_dir_resv_allowed(struct ocfs2_super *osb);
+
/**
* ocfs2_resv_discard() - truncate a reservation
* @resmap:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bca1ed3..9f0e211 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -95,6 +95,7 @@ struct mount_options
signed short slot;
int localalloc_opt;
unsigned int resv_level;
+ int dir_resv_level;
char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
};
@@ -177,6 +178,7 @@ enum {
Opt_usrquota,
Opt_grpquota,
Opt_resv_level,
+ Opt_dir_resv_level,
Opt_err,
};
@@ -204,6 +206,7 @@ static const match_table_t tokens = {
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
{Opt_resv_level, "resv_level=%u"},
+ {Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
};
@@ -1030,6 +1033,11 @@ static int ocfs2_fill_super(struct super_block *sb, void
*data, int silent)
ocfs2_la_set_sizes(osb, parsed_options.localalloc_opt);
osb->osb_resv_level = parsed_options.resv_level;
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ if (parsed_options.dir_resv_level == -1)
+ osb->osb_dir_resv_level = parsed_options.resv_level;
+ else
+ osb->osb_dir_resv_level = parsed_options.dir_resv_level;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -1291,6 +1299,7 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->localalloc_opt = -1;
mopt->cluster_stack[0] = '\0';
mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL;
+ mopt->dir_resv_level = -1;
if (!options) {
status = 1;
@@ -1445,6 +1454,17 @@ static int ocfs2_parse_options(struct super_block *sb,
option < OCFS2_MAX_RESV_LEVEL)
mopt->resv_level = option;
break;
+ case Opt_dir_resv_level:
+ if (is_remount)
+ break;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option >= OCFS2_MIN_RESV_LEVEL &&
+ option < OCFS2_MAX_RESV_LEVEL)
+ mopt->dir_resv_level = option;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1529,6 +1549,9 @@ static int ocfs2_show_options(struct seq_file *s, struct
vfsmount *mnt)
if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL)
seq_printf(s, ",resv_level=%d", osb->osb_resv_level);
+ if (osb->osb_dir_resv_level != osb->osb_resv_level)
+ seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+
return 0;
}
--
1.6.4.2
On Mon, Apr 05, 2010 at 06:17:12PM -0700, Mark Fasheh wrote:> The following patches apply some lessons I've learned during some testing of > a couple non-trivial workloads on Ocfs2. Roughly speaking, they tune "up" > the default local alloc windowxs, and tune "down" the reservations code a > bit. > > I primarily ran two tests - one simulating a large multi-node data-processing > workload, and the multi-threaded writers test, mtwrite at: > > http://www.kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/tests/mtwrite.c > > All patches were developed and tested on top of the current set of > allocation changes in ocfs2.git (primarily reservations, and the local alloc > fixes we have). A git branch containing all the patches is available, based > off 2.6.33: > > git pull git://git.kernel.org/pub/scm/linux/kernel/git/ocfs2-mark.git disk-alloc > > Basically though, this all involved many many test runs (of which I have > lots of data) and a thorough checking of fragmentation levels. I changed > only one value at a time. Details regarding some of the test runs are within > the specific patch descriptions.These patches are now in the merge-window branch of ocfs2.git. Joel -- "It is not the function of our government to keep the citizen from falling into error; it is the function of the citizen to keep the government from falling into error." - Robert H. Jackson Joel Becker Principal Software Developer Oracle E-mail: joel.becker at oracle.com Phone: (650) 506-8127