Tao Ma
2011-May-06 09:23 UTC
[Ocfs2-devel] [PATCH 0/3] ocfs2: Add batched discard support.
Hi all, These are the patches for adding batched discard support in ocfs2. I have tested it with xfstests 251 and it passed. btw, I have also run some tests against it(bonnie++, postmark, ffsb and fs_mark) and there are no big difference before and after the discard. Regards, Tao
Tao Ma
2011-May-06 09:27 UTC
[Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
From: Tao Ma <boyu.mt at taobao.com>
Add ocfs2_trim_fs to support trimming freed clusters in the
volume. A range will be given and all the freed clusters greater
than minlen will be discarded to the block layer.
Signed-off-by: Tao Ma <boyu.mt at taobao.com>
---
fs/ocfs2/alloc.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ocfs2/alloc.h | 1 +
2 files changed, 157 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 48aa9c7..93a3f92 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -29,6 +29,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -7184,3 +7185,158 @@ out_commit:
out:
return ret;
}
+
+static int ocfs2_trim_extent(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int count)
+{
+ u64 discard;
+
+ count = ocfs2_clusters_to_blocks(sb, count);
+ discard = le64_to_cpu(gd->bg_blkno) +
+ ocfs2_clusters_to_blocks(sb, start);
+
+ return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
+}
+
+static int ocfs2_trim_group(struct super_block *sb,
+ struct ocfs2_group_desc *gd,
+ int start, int max, int minbits)
+{
+ int ret = 0, count = 0, next;
+ void *bitmap = gd->bg_bitmap;
+
+ while (start < max) {
+ start = ocfs2_find_next_zero_bit(bitmap, max, start);
+ if (start >= max)
+ break;
+ next = ocfs2_find_next_bit(bitmap, max, start);
+
+ if ((next - start) >= minbits) {
+ ret = ocfs2_trim_extent(sb, gd,
+ start, next - start);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+ count += next - start;
+ }
+ start = next + 1;
+
+ if (fatal_signal_pending(current)) {
+ count = -ERESTARTSYS;
+ break;
+ }
+
+ if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
+ break;
+ }
+
+ if (ret < 0)
+ count = ret;
+
+ return count;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 start, len, minlen, trimmed, first_group, last_group, group;
+ int ret, cnt, first_bit, last_bit;
+ struct buffer_head *main_bm_bh = NULL;
+ struct inode *main_bm_inode = NULL;
+ struct buffer_head *gd_bh = NULL;
+ struct ocfs2_dinode *main_bm;
+ struct ocfs2_group_desc *gd = NULL;
+
+ start = range->start >> osb->s_clustersize_bits;
+ len = range->len >> osb->s_clustersize_bits;
+ minlen = range->minlen >> osb->s_clustersize_bits;
+ trimmed = 0;
+
+ if (!len) {
+ range->len = 0;
+ return 0;
+ }
+
+ if (minlen >= osb->bitmap_cpg)
+ return -EINVAL;
+
+ main_bm_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!main_bm_inode) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&main_bm_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+ main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
+
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
+
+ /* Determine first and last group to examine based on start and len */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
+ last_bit = osb->bitmap_cpg;
+
+ for (group = first_group; group <= last_group;) {
+ if (first_bit + len >= osb->bitmap_cpg)
+ last_bit = osb->bitmap_cpg;
+ else
+ last_bit = first_bit + len;
+
+ ret = ocfs2_read_group_descriptor(main_bm_inode,
+ main_bm, group,
+ &gd_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
+
+ gd = (struct ocfs2_group_desc *)gd_bh->b_data;
+ cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen);
+ brelse(gd_bh);
+ gd_bh = NULL;
+ if (cnt < 0) {
+ ret = cnt;
+ mlog_errno(ret);
+ break;
+ }
+
+ trimmed += cnt;
+ len -= osb->bitmap_cpg - first_bit;
+ first_bit = 0;
+ if (group == osb->first_cluster_group_blkno)
+ group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ else
+ group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
+ }
+ range->len = trimmed * sb->s_blocksize;
+out_unlock:
+ ocfs2_inode_unlock(main_bm_inode, 0);
+ brelse(main_bm_bh);
+out_mutex:
+ mutex_unlock(&main_bm_inode->i_mutex);
+ iput(main_bm_inode);
+out:
+ return ret;
+}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3bd08a0..ca381c5 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
struct buffer_head **leaf_bh);
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range);
/*
* Helper function to look at the # of clusters in an extent record.
*/
--
1.6.3.GIT
From: Tao Ma <boyu.mt at taobao.com>
Add the corresponding ioctl function for FITRIM.
Signed-off-by: Tao Ma <boyu.mt at taobao.com>
---
fs/ocfs2/ioctl.c | 24 ++++++++++++++++++++++++
1 files changed, 24 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8f13c59..312a28f 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -542,6 +542,29 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 0);
+ case FITRIM:
+ {
+ struct super_block *sb = inode->i_sb;
+ struct fstrim_range range;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&range, (struct fstrim_range *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ ret = ocfs2_trim_fs(sb, &range);
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user((struct fstrim_range *)arg, &range,
+ sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+ }
default:
return -ENOTTY;
}
@@ -569,6 +592,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd,
unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
+ case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, (struct reflink_arguments *)arg,
--
1.6.3.GIT
From: Tao Ma <boyu.mt at taobao.com>
Add the corresponding trace event for trim.
Signed-off-by: Tao Ma <boyu.mt at taobao.com>
---
fs/ocfs2/alloc.c | 7 +++++++
fs/ocfs2/ocfs2_trace.h | 25 +++++++++++++++++++++++++
2 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 93a3f92..12f1c33 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7196,6 +7196,8 @@ static int ocfs2_trim_extent(struct super_block *sb,
discard = le64_to_cpu(gd->bg_blkno) +
ocfs2_clusters_to_blocks(sb, start);
+ trace_ocfs2_trim_extent(sb, (unsigned long long)discard, count);
+
return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);
}
@@ -7206,6 +7208,9 @@ static int ocfs2_trim_group(struct super_block *sb,
int ret = 0, count = 0, next;
void *bitmap = gd->bg_bitmap;
+ trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
+ start, max, minbits);
+
while (start < max) {
start = ocfs2_find_next_zero_bit(bitmap, max, start);
if (start >= max)
@@ -7289,6 +7294,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct
fstrim_range *range)
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
+ trace_ocfs2_trim_fs(start, len, minlen);
+
/* Determine first and last group to examine based on start and len */
first_group = ocfs2_which_cluster_group(main_bm_inode, start);
if (first_group == osb->first_cluster_group_blkno)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a1dae5b..9ab22a1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -688,6 +688,31 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
__entry->blkno, __entry->bit)
);
+TRACE_EVENT(ocfs2_trim_extent,
+ TP_PROTO(struct super_block *sb, unsigned long long blk,
+ unsigned long long count),
+ TP_ARGS(sb, blk, count),
+ TP_STRUCT__entry(
+ __field(int, dev_major)
+ __field(int, dev_minor)
+ __field(unsigned long long, blk)
+ __field(__u64, count)
+ ),
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(sb->s_dev);
+ __entry->dev_minor = MINOR(sb->s_dev);
+ __entry->blk = blk;
+ __entry->count = count;
+ ),
+ TP_printk("%d %d %llu %llu",
+ __entry->dev_major, __entry->dev_minor,
+ __entry->blk, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_trim_group);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
+
/* End of trace events for fs/ocfs2/alloc.c. */
/* Trace events for fs/ocfs2/localalloc.c. */
--
1.6.3.GIT
Sunil Mushran
2011-May-09 23:02 UTC
[Ocfs2-devel] [PATCH 1/3] ocfs2: Add ocfs2_trim_fs for SSD trim support.
On 05/06/2011 02:27 AM, Tao Ma wrote:> From: Tao Ma<boyu.mt at taobao.com> > > Add ocfs2_trim_fs to support trimming freed clusters in the > volume. A range will be given and all the freed clusters greater > than minlen will be discarded to the block layer. > > Signed-off-by: Tao Ma<boyu.mt at taobao.com> > --- > fs/ocfs2/alloc.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/ocfs2/alloc.h | 1 + > 2 files changed, 157 insertions(+), 0 deletions(-) > > diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c > index 48aa9c7..93a3f92 100644 > --- a/fs/ocfs2/alloc.c > +++ b/fs/ocfs2/alloc.c > @@ -29,6 +29,7 @@ > #include<linux/highmem.h> > #include<linux/swap.h> > #include<linux/quotaops.h> > +#include<linux/blkdev.h> > > #include<cluster/masklog.h> > > @@ -7184,3 +7185,158 @@ out_commit: > out: > return ret; > } > + > +static int ocfs2_trim_extent(struct super_block *sb, > + struct ocfs2_group_desc *gd, > + int start, int count)u32 will be better for start and count.> +{ > + u64 discard; > + > + count = ocfs2_clusters_to_blocks(sb, count);ocfs2_clusters_to_blocks() returns u64.> + discard = le64_to_cpu(gd->bg_blkno) + > + ocfs2_clusters_to_blocks(sb, start); > + > + return sb_issue_discard(sb, discard, count, GFP_NOFS, 0);> +} > + > +static int ocfs2_trim_group(struct super_block *sb, > + struct ocfs2_group_desc *gd, > + int start, int max, int minbits) > +{ > + int ret = 0, count = 0, next; > + void *bitmap = gd->bg_bitmap; > + > + while (start< max) { > + start = ocfs2_find_next_zero_bit(bitmap, max, start); > + if (start>= max) > + break; > + next = ocfs2_find_next_bit(bitmap, max, start); > + > + if ((next - start)>= minbits) { > + ret = ocfs2_trim_extent(sb, gd, > + start, next - start); > + if (ret< 0) { > + mlog_errno(ret); > + break; > + } > + count += next - start; > + } > + start = next + 1; > + > + if (fatal_signal_pending(current)) { > + count = -ERESTARTSYS; > + break; > + } > + > + if ((le16_to_cpu(gd->bg_free_bits_count) - count)< minbits) > + break;This check could also be done earlier.> + } > + > + if (ret< 0) > + count = ret; > + > + return count; > +} > + > +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) > +{ > + struct ocfs2_super *osb = OCFS2_SB(sb); > + u64 start, len, minlen, trimmed, first_group, last_group, group; > + int ret, cnt, first_bit, last_bit; > + struct buffer_head *main_bm_bh = NULL; > + struct inode *main_bm_inode = NULL; > + struct buffer_head *gd_bh = NULL; > + struct ocfs2_dinode *main_bm; > + struct ocfs2_group_desc *gd = NULL; > + > + start = range->start>> osb->s_clustersize_bits; > + len = range->len>> osb->s_clustersize_bits; > + minlen = range->minlen>> osb->s_clustersize_bits; > + trimmed = 0; > + > + if (!len) { > + range->len = 0; > + return 0; > + } > + > + if (minlen>= osb->bitmap_cpg) > + return -EINVAL; > + > + main_bm_inode = ocfs2_get_system_file_inode(osb, > + GLOBAL_BITMAP_SYSTEM_INODE, > + OCFS2_INVALID_SLOT); > + if (!main_bm_inode) { > + ret = -EIO; > + mlog_errno(ret); > + goto out; > + } > + > + mutex_lock(&main_bm_inode->i_mutex); > + > + ret = ocfs2_inode_lock(main_bm_inode,&main_bm_bh, 0); > + if (ret< 0) { > + mlog_errno(ret); > + goto out_mutex; > + } > + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; > + > + if (start>= le32_to_cpu(main_bm->i_clusters)) { > + ret = -EINVAL; > + mlog_errno(ret);User error. No need to log it.> + goto out_unlock; > + } > + > + if (start + len> le32_to_cpu(main_bm->i_clusters)) > + len = le32_to_cpu(main_bm->i_clusters) - start; > + > + /* Determine first and last group to examine based on start and len */ > + first_group = ocfs2_which_cluster_group(main_bm_inode, start); > + if (first_group == osb->first_cluster_group_blkno) > + first_bit = start; > + else > + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); > + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); > + last_bit = osb->bitmap_cpg; > + > + for (group = first_group; group<= last_group;) { > + if (first_bit + len>= osb->bitmap_cpg) > + last_bit = osb->bitmap_cpg; > + else > + last_bit = first_bit + len; > + > + ret = ocfs2_read_group_descriptor(main_bm_inode, > + main_bm, group, > + &gd_bh); > + if (ret< 0) { > + mlog_errno(ret); > + break; > + } > + > + gd = (struct ocfs2_group_desc *)gd_bh->b_data; > + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); > + brelse(gd_bh); > + gd_bh = NULL; > + if (cnt< 0) { > + ret = cnt; > + mlog_errno(ret); > + break; > + } > + > + trimmed += cnt; > + len -= osb->bitmap_cpg - first_bit; > + first_bit = 0; > + if (group == osb->first_cluster_group_blkno) > + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); > + else > + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); > + } > + range->len = trimmed * sb->s_blocksize; > +out_unlock: > + ocfs2_inode_unlock(main_bm_inode, 0); > + brelse(main_bm_bh); > +out_mutex: > + mutex_unlock(&main_bm_inode->i_mutex); > + iput(main_bm_inode); > +out: > + return ret; > +} > diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h > index 3bd08a0..ca381c5 100644 > --- a/fs/ocfs2/alloc.h > +++ b/fs/ocfs2/alloc.h > @@ -239,6 +239,7 @@ int ocfs2_find_leaf(struct ocfs2_caching_info *ci, > struct buffer_head **leaf_bh); > int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); > > +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range); > /* > * Helper function to look at the # of clusters in an extent record. > */