thr3ads.net - Ocfs2 devel - [Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running [Jan 2019]

If this information is useful, please help other people find it:
Share via:

Changwei Ge

2019-Jan-15 08:00 UTC

[Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running

On 2019/1/15 13:49, Gang He wrote:
> Hello Changewei,
> 
>>>> On 2019/1/15 at 11:50, in message
> <63ADC13FD55D6546B7DECE290D39E3730127825EFD at
H3CMLB12-EX.srv.huawei-3com.com>,
> Changwei Ge <ge.changwei at h3c.com> wrote:
>> Hi Gang,
>>
>> Most parts of this patch look sane to me, just a tiny question...
>>
>> On 2019/1/11 17:01, Gang He wrote:
>>> The user reported this problem, the upper application IO was
>>> timeout when fstrim was running on this ocfs2 partition. the
>>> application monitoring resource agent considered that this
>>> application did not work, then this node was fenced by the cluster
>>> brain (e.g. pacemaker).
>>> The root cause is that fstrim thread always holds main_bm meta-file
>>> related locks until all the cluster groups are trimmed.
>>> This patch will make fstrim thread release main_bm meta-file
>>> related locks when each cluster group is trimmed, this will let
>>> the current application IO has a chance to claim the clusters from
>>> main_bm meta-file.
>>>
>>> Signed-off-by: Gang He <ghe at suse.com>
>>> ---
>>>    fs/ocfs2/alloc.c       | 159
+++++++++++++++++++++++++----------------
>>>    fs/ocfs2/dlmglue.c     |   5 ++
>>>    fs/ocfs2/ocfs2.h       |   1 +
>>>    fs/ocfs2/ocfs2_trace.h |   2 +
>>>    fs/ocfs2/super.c       |   2 +
>>>    5 files changed, 106 insertions(+), 63 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index d1cbb27808e2..6f0999015a44 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct
super_block *sb,
>>>    	return count;
>>>    }
>>>    
>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
*range)
>>> +static
>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range
*range)
>>>    {
>>>    	struct ocfs2_super *osb = OCFS2_SB(sb);
>>> -	u64 start, len, trimmed, first_group, last_group, group;
>>> +	u64 start, len, trimmed = 0, first_group, last_group = 0, group =
0;
>>>    	int ret, cnt;
>>>    	u32 first_bit, last_bit, minlen;
>>>    	struct buffer_head *main_bm_bh = NULL;
>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>> fstrim_range *range)
>>>    	struct buffer_head *gd_bh = NULL;
>>>    	struct ocfs2_dinode *main_bm;
>>>    	struct ocfs2_group_desc *gd = NULL;
>>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>    
>>>    	start = range->start >> osb->s_clustersize_bits;
>>>    	len = range->len >> osb->s_clustersize_bits;
>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>> fstrim_range *range)
>>>    	if (minlen >= osb->bitmap_cpg || range->len <
sb->s_blocksize)
>>>    		return -EINVAL;
>>>    
>>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>>> +
>>> +next_group:
>>>    	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>    						    GLOBAL_BITMAP_SYSTEM_INODE,
>>>    						    OCFS2_INVALID_SLOT);
>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>> fstrim_range *range)
>>>    	}
>>>    	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>    
>>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> -		ret = -EINVAL;
>>> -		goto out_unlock;
>>> -	}
>>> -
>>> -	len = range->len >> osb->s_clustersize_bits;
>>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>>> -
>>> -	trace_ocfs2_trim_fs(start, len, minlen);
>>> -
>>> -	ocfs2_trim_fs_lock_res_init(osb);
>>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> -	if (ret < 0) {
>>> -		if (ret != -EAGAIN) {
>>> -			mlog_errno(ret);
>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +	/*
>>> +	 * Do some check before trim the first group.
>>> +	 */
>>> +	if (!group) {
>>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>> +			ret = -EINVAL;
>>>    			goto out_unlock;
>>>    		}
>>>    
>>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> -		     "finish, which is running from another node.\n",
>>> -		     osb->dev_str);
>>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>> -		if (ret < 0) {
>>> -			mlog_errno(ret);
>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>> -			goto out_unlock;
>>> -		}
>>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>>    
>>> -		if (info.tf_valid && info.tf_success &&
>>> -		    info.tf_start == start && info.tf_len == len
&&
>>> -		    info.tf_minlen == minlen) {
>>> -			/* Avoid sending duplicated trim to a shared device */
>>> -			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> -			     "just done from node (%u), return.\n",
>>> -			     osb->dev_str, info.tf_nodenum);
>>> -			range->len = info.tf_trimlen;
>>> -			goto out_trimunlock;
>>> -		}
>>> +		/*
>>> +		 * Determine first and last group to examine based on
>>> +		 * start and len
>>> +		 */
>>> +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> +		if (first_group == osb->first_cluster_group_blkno)
>>> +			first_bit = start;
>>> +		else
>>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>>> +								first_group);
>>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>>> +						       start + len - 1);
>>> +		group = first_group;
>>>    	}
>>>    
>>> -	info.tf_nodenum = osb->node_num;
>>> -	info.tf_start = start;
>>> -	info.tf_len = len;
>>> -	info.tf_minlen = minlen;
>>> -
>>> -	/* Determine first and last group to examine based on start and
len */
>>> -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> -	if (first_group == osb->first_cluster_group_blkno)
>>> -		first_bit = start;
>>> -	else
>>> -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
>>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len
- 1);
>>> -	last_bit = osb->bitmap_cpg;
>>> -
>>> -	trimmed = 0;
>>> -	for (group = first_group; group <= last_group;) {
>>> +	do {
>>>    		if (first_bit + len >= osb->bitmap_cpg)
>>>    			last_bit = osb->bitmap_cpg;
>>>    		else
>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>> fstrim_range *range)
>>>    			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>    		else
>>>    			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>> -	}
>>> -	range->len = trimmed * sb->s_blocksize;
>>> +	} while (0);
>>>    
>>> -	info.tf_trimlen = range->len;
>>> -	info.tf_success = (ret ? 0 : 1);
>>> -	pinfo = &info;
>>> -out_trimunlock:
>>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>>    out_unlock:
>>>    	ocfs2_inode_unlock(main_bm_inode, 0);
>>>    	brelse(main_bm_bh);
>>> +	main_bm_bh = NULL;
>>>    out_mutex:
>>>    	inode_unlock(main_bm_inode);
>>>    	iput(main_bm_inode);
>>> +
>>> +	/*
>>> +	 * If all the groups trim are not done or failed, but we should
release
>>> +	 * main_bm related locks for avoiding the current IO starve, then
go to
>>> +	 * trim the next group
>>> +	 */
>>> +	if (ret >= 0 && group <= last_group)
>>> +		goto next_group;
>>>    out:
>>> +	range->len = trimmed * sb->s_blocksize;
>>> +	return ret;
>>> +}
>>> +
>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
*range)
>>> +{
>>> +	int ret;
>>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>> +
>>> +	ocfs2_trim_fs_lock_res_init(osb);
>>> +
>>> +	trace_ocfs2_trim_fs(range->start, range->len,
range->minlen);
>>> +
>>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> +	if (ret < 0) {
>>> +		if (ret != -EAGAIN) {
>>> +			mlog_errno(ret);
>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +			return ret;
>>> +		}
>>> +
>>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +		     "finish, which is running from another node.\n",
>>> +		     osb->dev_str);
>>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>> +		if (ret < 0) {
>>> +			mlog_errno(ret);
>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>> +			return ret;
>>> +		}
>>> +
>>> +		if (info.tf_valid && info.tf_success &&
>>> +		    info.tf_start == range->start &&
>>> +		    info.tf_len == range->len &&
>>> +		    info.tf_minlen == range->minlen) {
>>> +			/* Avoid sending duplicated trim to a shared device */
>>> +			mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> +			     "just done from node (%u), return.\n",
>>> +			     osb->dev_str, info.tf_nodenum);
>>> +			range->len = info.tf_trimlen;
>>> +			goto out;
>>> +		}
>>> +	}
>>> +
>>> +	info.tf_nodenum = osb->node_num;
>>> +	info.tf_start = range->start;
>>> +	info.tf_len = range->len;
>>> +	info.tf_minlen = range->minlen;
>>> +
>>> +	ret = ocfs2_trim_mainbm(sb, range);
>>> +
>>> +	info.tf_trimlen = range->len;
>>> +	info.tf_success = (ret < 0 ? 0 : 1);
>>> +	pinfo = &info;
>>> +out:
>>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>>    	return ret;
>>>    }
>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>> index 7c835824247e..af405586c5b1 100644
>>> --- a/fs/ocfs2/dlmglue.c
>>> +++ b/fs/ocfs2/dlmglue.c
>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct
ocfs2_super *osb)
>>>    {
>>>    	struct ocfs2_lock_res *lockres =
&osb->osb_trim_fs_lockres;
>>>    
>>> +	/* Only one trimfs thread are allowed to work at the same time.
*/
>>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>>> +
>>
>> Cluster lock of fstrim have a trylock behavior, will it be better if we
>> trylock here?
> Here, I prefer to just serialize fstrim threads on the local node to simplify the code logic,
> maybe the user want to do like that, although this behavior is not recommended.
> You know, on one node, ideally, the user should call  fstrim command once regularly.
> If he calls fstrim command more times in a very short time,
> the code will not make each fstrim command return to failure, just do the fstrim task one by one.
I have a thought having nothing to do with your patch.
Do you think it's possible for us to implement *discard on unlink file*, like
ext4 does?
Then the application wouldn't have to invoke fstrim periodically.

Thanks,
Changwei
> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Changwei
>>
>>>    	ocfs2_lock_res_init_once(lockres);
>>>    	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0,
lockres->l_name);
>>>    	ocfs2_lock_res_init_common(osb, lockres,
OCFS2_LOCK_TYPE_TRIM_FS,
>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct
ocfs2_super
>> *osb)
>>>    
>>>    	ocfs2_simple_drop_lockres(osb, lockres);
>>>    	ocfs2_lock_res_free(lockres);
>>> +
>>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>>    }
>>>    
>>>    static void ocfs2_orphan_scan_lock_res_init(struct
ocfs2_lock_res *res,
>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>> --- a/fs/ocfs2/ocfs2.h
>>> +++ b/fs/ocfs2/ocfs2.h
>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>    	struct ocfs2_lock_res osb_rename_lockres;
>>>    	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>    	struct ocfs2_lock_res osb_trim_fs_lockres;
>>> +	struct mutex obs_trim_fs_mutex;
>>>    	struct ocfs2_dlm_debug *osb_dlm_debug;
>>>    
>>>    	struct dentry *osb_debug_root;
>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>> --- a/fs/ocfs2/ocfs2_trace.h
>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>    
>>>    DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>    
>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>> +
>>>    DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>    
>>>    /* End of trace events for fs/ocfs2/alloc.c. */
>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>> index 3415e0b09398..96ae7cedd487 100644
>>> --- a/fs/ocfs2/super.c
>>> +++ b/fs/ocfs2/super.c
>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct
super_block *sb)
>>>    	if (ocfs2_is_hard_readonly(osb))
>>>    		goto leave;
>>>    
>>> +	mutex_init(&osb->obs_trim_fs_mutex);
>>> +
>>>    	status = ocfs2_dlm_init(osb);
>>>    	if (status < 0) {
>>>    		mlog_errno(status);
>>>
>

Gang He

2019-Jan-15 08:22 UTC

head link

[Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running

Hello ChangWei,
>>> On 2019/1/15 at 16:00, in message
<63ADC13FD55D6546B7DECE290D39E37301278265F3 at H3CMLB12-EX.srv.huawei-3com.com>,
Changwei Ge <ge.changwei at h3c.com> wrote:
> On 2019/1/15 13:49, Gang He wrote:
>> Hello Changewei,
>> 
>>>>> On 2019/1/15 at 11:50, in message
>> <63ADC13FD55D6546B7DECE290D39E3730127825EFD at
H3CMLB12-EX.srv.huawei-3com.com>,
>> Changwei Ge <ge.changwei at h3c.com> wrote:
>>> Hi Gang,
>>>
>>> Most parts of this patch look sane to me, just a tiny question...
>>>
>>> On 2019/1/11 17:01, Gang He wrote:
>>>> The user reported this problem, the upper application IO was
>>>> timeout when fstrim was running on this ocfs2 partition. the
>>>> application monitoring resource agent considered that this
>>>> application did not work, then this node was fenced by the
cluster
>>>> brain (e.g. pacemaker).
>>>> The root cause is that fstrim thread always holds main_bm
meta-file
>>>> related locks until all the cluster groups are trimmed.
>>>> This patch will make fstrim thread release main_bm meta-file
>>>> related locks when each cluster group is trimmed, this will let
>>>> the current application IO has a chance to claim the clusters
from
>>>> main_bm meta-file.
>>>>
>>>> Signed-off-by: Gang He <ghe at suse.com>
>>>> ---
>>>>    fs/ocfs2/alloc.c       | 159
+++++++++++++++++++++++++----------------
>>>>    fs/ocfs2/dlmglue.c     |   5 ++
>>>>    fs/ocfs2/ocfs2.h       |   1 +
>>>>    fs/ocfs2/ocfs2_trace.h |   2 +
>>>>    fs/ocfs2/super.c       |   2 +
>>>>    5 files changed, 106 insertions(+), 63 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>>> index d1cbb27808e2..6f0999015a44 100644
>>>> --- a/fs/ocfs2/alloc.c
>>>> +++ b/fs/ocfs2/alloc.c
>>>> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct
super_block *sb,
>>>>    	return count;
>>>>    }
>>>>    
>>>> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
*range)
>>>> +static
>>>> +int ocfs2_trim_mainbm(struct super_block *sb, struct
fstrim_range *range)
>>>>    {
>>>>    	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> -	u64 start, len, trimmed, first_group, last_group, group;
>>>> +	u64 start, len, trimmed = 0, first_group, last_group = 0,
group = 0;
>>>>    	int ret, cnt;
>>>>    	u32 first_bit, last_bit, minlen;
>>>>    	struct buffer_head *main_bm_bh = NULL;
>>>> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>>> fstrim_range *range)
>>>>    	struct buffer_head *gd_bh = NULL;
>>>>    	struct ocfs2_dinode *main_bm;
>>>>    	struct ocfs2_group_desc *gd = NULL;
>>>> -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>>    
>>>>    	start = range->start >>
osb->s_clustersize_bits;
>>>>    	len = range->len >> osb->s_clustersize_bits;
>>>> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb,
struct
>>> fstrim_range *range)
>>>>    	if (minlen >= osb->bitmap_cpg || range->len <
sb->s_blocksize)
>>>>    		return -EINVAL;
>>>>    
>>>> +	trace_ocfs2_trim_mainbm(start, len, minlen);
>>>> +
>>>> +next_group:
>>>>    	main_bm_inode = ocfs2_get_system_file_inode(osb,
>>>>    						    GLOBAL_BITMAP_SYSTEM_INODE,
>>>>    						    OCFS2_INVALID_SLOT);
>>>> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block
*sb, struct
>>> fstrim_range *range)
>>>>    	}
>>>>    	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>>>>    
>>>> -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>> -		ret = -EINVAL;
>>>> -		goto out_unlock;
>>>> -	}
>>>> -
>>>> -	len = range->len >> osb->s_clustersize_bits;
>>>> -	if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>> -		len = le32_to_cpu(main_bm->i_clusters) - start;
>>>> -
>>>> -	trace_ocfs2_trim_fs(start, len, minlen);
>>>> -
>>>> -	ocfs2_trim_fs_lock_res_init(osb);
>>>> -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>> -	if (ret < 0) {
>>>> -		if (ret != -EAGAIN) {
>>>> -			mlog_errno(ret);
>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +	/*
>>>> +	 * Do some check before trim the first group.
>>>> +	 */
>>>> +	if (!group) {
>>>> +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
>>>> +			ret = -EINVAL;
>>>>    			goto out_unlock;
>>>>    		}
>>>>    
>>>> -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>> -		     "finish, which is running from another
node.\n",
>>>> -		     osb->dev_str);
>>>> -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>> -		if (ret < 0) {
>>>> -			mlog_errno(ret);
>>>> -			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> -			goto out_unlock;
>>>> -		}
>>>> +		if (start + len > le32_to_cpu(main_bm->i_clusters))
>>>> +			len = le32_to_cpu(main_bm->i_clusters) - start;
>>>>    
>>>> -		if (info.tf_valid && info.tf_success &&
>>>> -		    info.tf_start == start && info.tf_len == len
&&
>>>> -		    info.tf_minlen == minlen) {
>>>> -			/* Avoid sending duplicated trim to a shared device */
>>>> -			mlog(ML_NOTICE, "The same trim on device (%s) was
"
>>>> -			     "just done from node (%u), return.\n",
>>>> -			     osb->dev_str, info.tf_nodenum);
>>>> -			range->len = info.tf_trimlen;
>>>> -			goto out_trimunlock;
>>>> -		}
>>>> +		/*
>>>> +		 * Determine first and last group to examine based on
>>>> +		 * start and len
>>>> +		 */
>>>> +		first_group = ocfs2_which_cluster_group(main_bm_inode,
start);
>>>> +		if (first_group == osb->first_cluster_group_blkno)
>>>> +			first_bit = start;
>>>> +		else
>>>> +			first_bit = start - ocfs2_blocks_to_clusters(sb,
>>>> +								first_group);
>>>> +		last_group = ocfs2_which_cluster_group(main_bm_inode,
>>>> +						       start + len - 1);
>>>> +		group = first_group;
>>>>    	}
>>>>    
>>>> -	info.tf_nodenum = osb->node_num;
>>>> -	info.tf_start = start;
>>>> -	info.tf_len = len;
>>>> -	info.tf_minlen = minlen;
>>>> -
>>>> -	/* Determine first and last group to examine based on start
and len */
>>>> -	first_group = ocfs2_which_cluster_group(main_bm_inode,
start);
>>>> -	if (first_group == osb->first_cluster_group_blkno)
>>>> -		first_bit = start;
>>>> -	else
>>>> -		first_bit = start - ocfs2_blocks_to_clusters(sb,
first_group);
>>>> -	last_group = ocfs2_which_cluster_group(main_bm_inode, start +
len - 1);
>>>> -	last_bit = osb->bitmap_cpg;
>>>> -
>>>> -	trimmed = 0;
>>>> -	for (group = first_group; group <= last_group;) {
>>>> +	do {
>>>>    		if (first_bit + len >= osb->bitmap_cpg)
>>>>    			last_bit = osb->bitmap_cpg;
>>>>    		else
>>>> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block
*sb, struct
>>> fstrim_range *range)
>>>>    			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>>>    		else
>>>>    			group += ocfs2_clusters_to_blocks(sb,
osb->bitmap_cpg);
>>>> -	}
>>>> -	range->len = trimmed * sb->s_blocksize;
>>>> +	} while (0);
>>>>    
>>>> -	info.tf_trimlen = range->len;
>>>> -	info.tf_success = (ret ? 0 : 1);
>>>> -	pinfo = &info;
>>>> -out_trimunlock:
>>>> -	ocfs2_trim_fs_unlock(osb, pinfo);
>>>> -	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>    out_unlock:
>>>>    	ocfs2_inode_unlock(main_bm_inode, 0);
>>>>    	brelse(main_bm_bh);
>>>> +	main_bm_bh = NULL;
>>>>    out_mutex:
>>>>    	inode_unlock(main_bm_inode);
>>>>    	iput(main_bm_inode);
>>>> +
>>>> +	/*
>>>> +	 * If all the groups trim are not done or failed, but we
should release
>>>> +	 * main_bm related locks for avoiding the current IO starve,
then go to
>>>> +	 * trim the next group
>>>> +	 */
>>>> +	if (ret >= 0 && group <= last_group)
>>>> +		goto next_group;
>>>>    out:
>>>> +	range->len = trimmed * sb->s_blocksize;
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range
*range)
>>>> +{
>>>> +	int ret;
>>>> +	struct ocfs2_super *osb = OCFS2_SB(sb);
>>>> +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>> +
>>>> +	ocfs2_trim_fs_lock_res_init(osb);
>>>> +
>>>> +	trace_ocfs2_trim_fs(range->start, range->len,
range->minlen);
>>>> +
>>>> +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>> +	if (ret < 0) {
>>>> +		if (ret != -EAGAIN) {
>>>> +			mlog_errno(ret);
>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +			return ret;
>>>> +		}
>>>> +
>>>> +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>>> +		     "finish, which is running from another
node.\n",
>>>> +		     osb->dev_str);
>>>> +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
>>>> +		if (ret < 0) {
>>>> +			mlog_errno(ret);
>>>> +			ocfs2_trim_fs_lock_res_uninit(osb);
>>>> +			return ret;
>>>> +		}
>>>> +
>>>> +		if (info.tf_valid && info.tf_success &&
>>>> +		    info.tf_start == range->start &&
>>>> +		    info.tf_len == range->len &&
>>>> +		    info.tf_minlen == range->minlen) {
>>>> +			/* Avoid sending duplicated trim to a shared device */
>>>> +			mlog(ML_NOTICE, "The same trim on device (%s) was
"
>>>> +			     "just done from node (%u), return.\n",
>>>> +			     osb->dev_str, info.tf_nodenum);
>>>> +			range->len = info.tf_trimlen;
>>>> +			goto out;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	info.tf_nodenum = osb->node_num;
>>>> +	info.tf_start = range->start;
>>>> +	info.tf_len = range->len;
>>>> +	info.tf_minlen = range->minlen;
>>>> +
>>>> +	ret = ocfs2_trim_mainbm(sb, range);
>>>> +
>>>> +	info.tf_trimlen = range->len;
>>>> +	info.tf_success = (ret < 0 ? 0 : 1);
>>>> +	pinfo = &info;
>>>> +out:
>>>> +	ocfs2_trim_fs_unlock(osb, pinfo);
>>>> +	ocfs2_trim_fs_lock_res_uninit(osb);
>>>>    	return ret;
>>>>    }
>>>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
>>>> index 7c835824247e..af405586c5b1 100644
>>>> --- a/fs/ocfs2/dlmglue.c
>>>> +++ b/fs/ocfs2/dlmglue.c
>>>> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct
ocfs2_super *osb)
>>>>    {
>>>>    	struct ocfs2_lock_res *lockres =
&osb->osb_trim_fs_lockres;
>>>>    
>>>> +	/* Only one trimfs thread are allowed to work at the same
time. */
>>>> +	mutex_lock(&osb->obs_trim_fs_mutex);
>>>> +
>>>
>>> Cluster lock of fstrim have a trylock behavior, will it be better
if we
>>> trylock here?
>> Here, I prefer to just serialize fstrim threads on the local node to 
> simplify the code logic,
>> maybe the user want to do like that, although this behavior is not 
> recommended.
>> You know, on one node, ideally, the user should call  fstrim command
once
> regularly.
>> If he calls fstrim command more times in a very short time,
>> the code will not make each fstrim command return to failure, just do
the
> fstrim task one by one.
> 
> I have a thought having nothing to do with your patch.
> Do you think it's possible for us to implement *discard on unlink file*
like
> ext4 does.
> So the application doesn't have to invoke fstrim periodically.

Yes, we can do some investigation into adding a "discard" mount option
to support discarding a file when it is deleted.
This can be considered a separate feature, since its discard timing and
granularity are different from those of fstrim.
The scheduled fstrim command can be considered a traditional file-system-level
trim.
If the file system also supports on-demand discard when some blocks are
released, that's even better.
Of course, these two features can coexist.

Thanks
Gang
> 
> Thanks,
> Changwei
> 
>> 
>> Thanks
>> Gang
>> 
>>>
>>> Thanks,
>>> Changwei
>>>
>>>>    	ocfs2_lock_res_init_once(lockres);
>>>>    	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0,
lockres->l_name);
>>>>    	ocfs2_lock_res_init_common(osb, lockres,
OCFS2_LOCK_TYPE_TRIM_FS,
>>>> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct
ocfs2_super
>>> *osb)
>>>>    
>>>>    	ocfs2_simple_drop_lockres(osb, lockres);
>>>>    	ocfs2_lock_res_free(lockres);
>>>> +
>>>> +	mutex_unlock(&osb->obs_trim_fs_mutex);
>>>>    }
>>>>    
>>>>    static void ocfs2_orphan_scan_lock_res_init(struct
ocfs2_lock_res *res,
>>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
>>>> index 4f86ac0027b5..1f029fbe8b8d 100644
>>>> --- a/fs/ocfs2/ocfs2.h
>>>> +++ b/fs/ocfs2/ocfs2.h
>>>> @@ -407,6 +407,7 @@ struct ocfs2_super
>>>>    	struct ocfs2_lock_res osb_rename_lockres;
>>>>    	struct ocfs2_lock_res osb_nfs_sync_lockres;
>>>>    	struct ocfs2_lock_res osb_trim_fs_lockres;
>>>> +	struct mutex obs_trim_fs_mutex;
>>>>    	struct ocfs2_dlm_debug *osb_dlm_debug;
>>>>    
>>>>    	struct dentry *osb_debug_root;
>>>> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
>>>> index 2ee76a90ba8f..dc4bce1649c1 100644
>>>> --- a/fs/ocfs2/ocfs2_trace.h
>>>> +++ b/fs/ocfs2/ocfs2_trace.h
>>>> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>>>>    
>>>>    DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>>>>    
>>>> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
>>>> +
>>>>    DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>>>>    
>>>>    /* End of trace events for fs/ocfs2/alloc.c. */
>>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
>>>> index 3415e0b09398..96ae7cedd487 100644
>>>> --- a/fs/ocfs2/super.c
>>>> +++ b/fs/ocfs2/super.c
>>>> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct
super_block *sb)
>>>>    	if (ocfs2_is_hard_readonly(osb))
>>>>    		goto leave;
>>>>    
>>>> +	mutex_init(&osb->obs_trim_fs_mutex);
>>>> +
>>>>    	status = ocfs2_dlm_init(osb);
>>>>    	if (status < 0) {
>>>>    		mlog_errno(status);
>>>>
>>

Ocfs2 devel - Jan 2019 - [PATCH] ocfs2: fix the application IO timeout when fstrim is running

[Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running

[Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running