'Re: [Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       ocfs2-devel
Subject:    Re: [Ocfs2-devel] [PATCH] ocfs2: fix the application IO timeout when fstrim is running
From:       Changwei Ge <ge.changwei () h3c ! com>
Date:       2019-01-15 9:33:11
Message-ID: 63ADC13FD55D6546B7DECE290D39E3730127829443 () H3CMLB12-EX ! srv ! huawei-3com ! com
[Download RAW message or body]

Hi Gang,

It looks good to me.

On 2019/1/15 16:23, Gang He wrote:
> Hello ChangWei,
> 
> > > > On 2019/1/15 at 16:00, in message
> <63ADC13FD55D6546B7DECE290D39E37301278265F3@H3CMLB12-EX.srv.huawei-3com.com>,
> Changwei Ge <ge.changwei@h3c.com> wrote:
> > On 2019/1/15 13:49, Gang He wrote:
> > > Hello Changewei,
> > > 
> > > > > > On 2019/1/15 at 11:50, in message
> > > <63ADC13FD55D6546B7DECE290D39E3730127825EFD@H3CMLB12-EX.srv.huawei-3com.com>,
> > > Changwei Ge <ge.changwei@h3c.com> wrote:
> > > > Hi Gang,
> > > > 
> > > > Most parts of this patch look sane to me, just a tiny question...
> > > > 
> > > > On 2019/1/11 17:01, Gang He wrote:
> > > > > The user reported this problem, the upper application IO was
> > > > > timeout when fstrim was running on this ocfs2 partition. the
> > > > > application monitoring resource agent considered that this
> > > > > application did not work, then this node was fenced by the cluster
> > > > > brain (e.g. pacemaker).
> > > > > The root cause is that fstrim thread always holds main_bm meta-file
> > > > > related locks until all the cluster groups are trimmed.
> > > > > This patch will make fstrim thread release main_bm meta-file
> > > > > related locks when each cluster group is trimmed, this will let
> > > > > the current application IO has a chance to claim the clusters from
> > > > > main_bm meta-file.
> > > > > 
> > > > > Signed-off-by: Gang He <ghe@suse.com>

Reviewed-by: Changwei Ge <ge.changwei@h3c.com>

> > > > > ---
> > > > > fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
> > > > > fs/ocfs2/dlmglue.c     |   5 ++
> > > > > fs/ocfs2/ocfs2.h       |   1 +
> > > > > fs/ocfs2/ocfs2_trace.h |   2 +
> > > > > fs/ocfs2/super.c       |   2 +
> > > > > 5 files changed, 106 insertions(+), 63 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> > > > > index d1cbb27808e2..6f0999015a44 100644
> > > > > --- a/fs/ocfs2/alloc.c
> > > > > +++ b/fs/ocfs2/alloc.c
> > > > > @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
> > > > > 	return count;
> > > > > }
> > > > > 
> > > > > -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> > > > > +static
> > > > > +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
> > > > > {
> > > > > 	struct ocfs2_super *osb = OCFS2_SB(sb);
> > > > > -	u64 start, len, trimmed, first_group, last_group, group;
> > > > > +	u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
> > > > > 	int ret, cnt;
> > > > > 	u32 first_bit, last_bit, minlen;
> > > > > 	struct buffer_head *main_bm_bh = NULL;
> > > > > @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> > > > fstrim_range *range)
> > > > > 	struct buffer_head *gd_bh = NULL;
> > > > > 	struct ocfs2_dinode *main_bm;
> > > > > 	struct ocfs2_group_desc *gd = NULL;
> > > > > -	struct ocfs2_trim_fs_info info, *pinfo = NULL;
> > > > > 
> > > > > 	start = range->start >> osb->s_clustersize_bits;
> > > > > 	len = range->len >> osb->s_clustersize_bits;
> > > > > @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> > > > fstrim_range *range)
> > > > > 	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
> > > > > 		return -EINVAL;
> > > > > 
> > > > > +	trace_ocfs2_trim_mainbm(start, len, minlen);
> > > > > +
> > > > > +next_group:
> > > > > 	main_bm_inode = ocfs2_get_system_file_inode(osb,
> > > > > 						    GLOBAL_BITMAP_SYSTEM_INODE,
> > > > > 						    OCFS2_INVALID_SLOT);
> > > > > @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> > > > fstrim_range *range)
> > > > > 	}
> > > > > 	main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
> > > > > 
> > > > > -	if (start >= le32_to_cpu(main_bm->i_clusters)) {
> > > > > -		ret = -EINVAL;
> > > > > -		goto out_unlock;
> > > > > -	}
> > > > > -
> > > > > -	len = range->len >> osb->s_clustersize_bits;
> > > > > -	if (start + len > le32_to_cpu(main_bm->i_clusters))
> > > > > -		len = le32_to_cpu(main_bm->i_clusters) - start;
> > > > > -
> > > > > -	trace_ocfs2_trim_fs(start, len, minlen);
> > > > > -
> > > > > -	ocfs2_trim_fs_lock_res_init(osb);
> > > > > -	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> > > > > -	if (ret < 0) {
> > > > > -		if (ret != -EAGAIN) {
> > > > > -			mlog_errno(ret);
> > > > > -			ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > +	/*
> > > > > +	 * Do some check before trim the first group.
> > > > > +	 */
> > > > > +	if (!group) {
> > > > > +		if (start >= le32_to_cpu(main_bm->i_clusters)) {
> > > > > +			ret = -EINVAL;
> > > > > 			goto out_unlock;
> > > > > 		}
> > > > > 
> > > > > -		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> > > > > -		     "finish, which is running from another node.\n",
> > > > > -		     osb->dev_str);
> > > > > -		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> > > > > -		if (ret < 0) {
> > > > > -			mlog_errno(ret);
> > > > > -			ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > -			goto out_unlock;
> > > > > -		}
> > > > > +		if (start + len > le32_to_cpu(main_bm->i_clusters))
> > > > > +			len = le32_to_cpu(main_bm->i_clusters) - start;
> > > > > 
> > > > > -		if (info.tf_valid && info.tf_success &&
> > > > > -		    info.tf_start == start && info.tf_len == len &&
> > > > > -		    info.tf_minlen == minlen) {
> > > > > -			/* Avoid sending duplicated trim to a shared device */
> > > > > -			mlog(ML_NOTICE, "The same trim on device (%s) was "
> > > > > -			     "just done from node (%u), return.\n",
> > > > > -			     osb->dev_str, info.tf_nodenum);
> > > > > -			range->len = info.tf_trimlen;
> > > > > -			goto out_trimunlock;
> > > > > -		}
> > > > > +		/*
> > > > > +		 * Determine first and last group to examine based on
> > > > > +		 * start and len
> > > > > +		 */
> > > > > +		first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> > > > > +		if (first_group == osb->first_cluster_group_blkno)
> > > > > +			first_bit = start;
> > > > > +		else
> > > > > +			first_bit = start - ocfs2_blocks_to_clusters(sb,
> > > > > +								first_group);
> > > > > +		last_group = ocfs2_which_cluster_group(main_bm_inode,
> > > > > +						       start + len - 1);
> > > > > +		group = first_group;
> > > > > 	}
> > > > > 
> > > > > -	info.tf_nodenum = osb->node_num;
> > > > > -	info.tf_start = start;
> > > > > -	info.tf_len = len;
> > > > > -	info.tf_minlen = minlen;
> > > > > -
> > > > > -	/* Determine first and last group to examine based on start and len */
> > > > > -	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> > > > > -	if (first_group == osb->first_cluster_group_blkno)
> > > > > -		first_bit = start;
> > > > > -	else
> > > > > -		first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> > > > > -	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> > > > > -	last_bit = osb->bitmap_cpg;
> > > > > -
> > > > > -	trimmed = 0;
> > > > > -	for (group = first_group; group <= last_group;) {
> > > > > +	do {
> > > > > 		if (first_bit + len >= osb->bitmap_cpg)
> > > > > 			last_bit = osb->bitmap_cpg;
> > > > > 		else
> > > > > @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> > > > fstrim_range *range)
> > > > > 			group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> > > > > 		else
> > > > > 			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> > > > > -	}
> > > > > -	range->len = trimmed * sb->s_blocksize;
> > > > > +	} while (0);
> > > > > 
> > > > > -	info.tf_trimlen = range->len;
> > > > > -	info.tf_success = (ret ? 0 : 1);
> > > > > -	pinfo = &info;
> > > > > -out_trimunlock:
> > > > > -	ocfs2_trim_fs_unlock(osb, pinfo);
> > > > > -	ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > out_unlock:
> > > > > 	ocfs2_inode_unlock(main_bm_inode, 0);
> > > > > 	brelse(main_bm_bh);
> > > > > +	main_bm_bh = NULL;
> > > > > out_mutex:
> > > > > 	inode_unlock(main_bm_inode);
> > > > > 	iput(main_bm_inode);
> > > > > +
> > > > > +	/*
> > > > > +	 * If all the groups trim are not done or failed, but we should release
> > > > > +	 * main_bm related locks for avoiding the current IO starve, then go to
> > > > > +	 * trim the next group
> > > > > +	 */
> > > > > +	if (ret >= 0 && group <= last_group)
> > > > > +		goto next_group;
> > > > > out:
> > > > > +	range->len = trimmed * sb->s_blocksize;
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> > > > > +{
> > > > > +	int ret;
> > > > > +	struct ocfs2_super *osb = OCFS2_SB(sb);
> > > > > +	struct ocfs2_trim_fs_info info, *pinfo = NULL;
> > > > > +
> > > > > +	ocfs2_trim_fs_lock_res_init(osb);
> > > > > +
> > > > > +	trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
> > > > > +
> > > > > +	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> > > > > +	if (ret < 0) {
> > > > > +		if (ret != -EAGAIN) {
> > > > > +			mlog_errno(ret);
> > > > > +			ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > +			return ret;
> > > > > +		}
> > > > > +
> > > > > +		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> > > > > +		     "finish, which is running from another node.\n",
> > > > > +		     osb->dev_str);
> > > > > +		ret = ocfs2_trim_fs_lock(osb, &info, 0);
> > > > > +		if (ret < 0) {
> > > > > +			mlog_errno(ret);
> > > > > +			ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > +			return ret;
> > > > > +		}
> > > > > +
> > > > > +		if (info.tf_valid && info.tf_success &&
> > > > > +		    info.tf_start == range->start &&
> > > > > +		    info.tf_len == range->len &&
> > > > > +		    info.tf_minlen == range->minlen) {
> > > > > +			/* Avoid sending duplicated trim to a shared device */
> > > > > +			mlog(ML_NOTICE, "The same trim on device (%s) was "
> > > > > +			     "just done from node (%u), return.\n",
> > > > > +			     osb->dev_str, info.tf_nodenum);
> > > > > +			range->len = info.tf_trimlen;
> > > > > +			goto out;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	info.tf_nodenum = osb->node_num;
> > > > > +	info.tf_start = range->start;
> > > > > +	info.tf_len = range->len;
> > > > > +	info.tf_minlen = range->minlen;
> > > > > +
> > > > > +	ret = ocfs2_trim_mainbm(sb, range);
> > > > > +
> > > > > +	info.tf_trimlen = range->len;
> > > > > +	info.tf_success = (ret < 0 ? 0 : 1);
> > > > > +	pinfo = &info;
> > > > > +out:
> > > > > +	ocfs2_trim_fs_unlock(osb, pinfo);
> > > > > +	ocfs2_trim_fs_lock_res_uninit(osb);
> > > > > 	return ret;
> > > > > }
> > > > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> > > > > index 7c835824247e..af405586c5b1 100644
> > > > > --- a/fs/ocfs2/dlmglue.c
> > > > > +++ b/fs/ocfs2/dlmglue.c
> > > > > @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super \
> > > > > *osb) {
> > > > > 	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
> > > > > 
> > > > > +	/* Only one trimfs thread are allowed to work at the same time. */
> > > > > +	mutex_lock(&osb->obs_trim_fs_mutex);
> > > > > +
> > > > 
> > > > Cluster lock of fstrim have a trylock behavior, will it be better if we
> > > > trylock here?
> > > Here, I prefer to just serialize fstrim threads on the local node to
> > simplify the code logic,
> > > maybe the user want to do like that, although this behavior is not
> > recommended.
> > > You know, on one node, ideally, the user should call  fstrim command once
> > regularly.
> > > If he calls fstrim command more times in a very short time,
> > > the code will not make each fstrim command return to failure, just do the
> > fstrim task one by one.
> > 
> > I have a thought having nothing to do with your patch.
> > Do you think it's possible for us to implement *discard on unlink file* like
> > ext4 does.
> > So the application doesn't have to invoke fstrim periodically.
> Yes, we can do some investigation for adding a mount option "discard" to support \
> discard a file when it is deleted. This can be considered as another feature, since \
> its discard occasion and granularity is different with fstrim. The scheduled fstrim \
> command can considered as a traditional file system level trim. If the file system \
> supports the on-demand discard when some blocks were released, that's better. Of \
> course. these two features can coexist.

Good to hear this. :-)

> 
> Thanks
> Gang
> 
> > 
> > Thanks,
> > Changwei
> > 
> > > 
> > > Thanks
> > > Gang
> > > 
> > > > 
> > > > Thanks,
> > > > Changwei
> > > > 
> > > > > 	ocfs2_lock_res_init_once(lockres);
> > > > > 	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
> > > > > 	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
> > > > > @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super
> > > > *osb)
> > > > > 
> > > > > 	ocfs2_simple_drop_lockres(osb, lockres);
> > > > > 	ocfs2_lock_res_free(lockres);
> > > > > +
> > > > > +	mutex_unlock(&osb->obs_trim_fs_mutex);
> > > > > }
> > > > > 
> > > > > static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> > > > > diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> > > > > index 4f86ac0027b5..1f029fbe8b8d 100644
> > > > > --- a/fs/ocfs2/ocfs2.h
> > > > > +++ b/fs/ocfs2/ocfs2.h
> > > > > @@ -407,6 +407,7 @@ struct ocfs2_super
> > > > > 	struct ocfs2_lock_res osb_rename_lockres;
> > > > > 	struct ocfs2_lock_res osb_nfs_sync_lockres;
> > > > > 	struct ocfs2_lock_res osb_trim_fs_lockres;
> > > > > +	struct mutex obs_trim_fs_mutex;
> > > > > 	struct ocfs2_dlm_debug *osb_dlm_debug;
> > > > > 
> > > > > 	struct dentry *osb_debug_root;
> > > > > diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
> > > > > index 2ee76a90ba8f..dc4bce1649c1 100644
> > > > > --- a/fs/ocfs2/ocfs2_trace.h
> > > > > +++ b/fs/ocfs2/ocfs2_trace.h
> > > > > @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
> > > > > 
> > > > > DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
> > > > > 
> > > > > +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
> > > > > +
> > > > > DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
> > > > > 
> > > > > /* End of trace events for fs/ocfs2/alloc.c. */
> > > > > diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> > > > > index 3415e0b09398..96ae7cedd487 100644
> > > > > --- a/fs/ocfs2/super.c
> > > > > +++ b/fs/ocfs2/super.c
> > > > > @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
> > > > > 	if (ocfs2_is_hard_readonly(osb))
> > > > > 		goto leave;
> > > > > 
> > > > > +	mutex_init(&osb->obs_trim_fs_mutex);
> > > > > +
> > > > > 	status = ocfs2_dlm_init(osb);
> > > > > 	if (status < 0) {
> > > > > 		mlog_errno(status);
> > > > > 
> > > 
> 

_______________________________________________
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel

[prev in list] [next in list] [prev in thread] [next in thread]