Subject: add metadata_incore ioctl in vfs

Add an ioctl to the VFS that dumps a filesystem's in-memory metadata.
Userspace collects this information and uses it to do metadata readahead.
A filesystem can hook into super_operations.metadata_incore to report its
metadata in a filesystem-specific way. The next patch gives an example of
how to implement .metadata_incore in btrfs.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/compat_ioctl.c  |    2 ++
 fs/ioctl.c         |   42 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |   10 ++++++++++
 3 files changed, 54 insertions(+)

Index: linux/fs/ioctl.c
===================================================================
--- linux.orig/fs/ioctl.c	2011-01-18 10:15:17.000000000 +0800
+++ linux/fs/ioctl.c	2011-01-18 10:39:40.000000000 +0800
@@ -530,6 +530,45 @@ static int ioctl_fsthaw(struct file *fil
 }

 /*
+ * Copy info about metadata in memory to userspace
+ * Returns:
+ *	= 1, one metadata range copied to userspace
+ *	= 0, no more metadata
+ *	< 0, error
+ */
+static int ioctl_metadata_incore(struct file *filp, void __user *argp)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	struct metadata_incore_args args;
+	loff_t offset;
+	ssize_t size;
+
+	if (!sb->s_op->metadata_incore)
+		return -EINVAL;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	/* we check metadata info in page units */
+	if (args.offset & ~PAGE_CACHE_MASK)
+		return -EINVAL;
+
+	offset = args.offset;
+
+	if (sb->s_op->metadata_incore(sb, &offset, &size) < 0)
+		return 0;
+
+	args.address = offset;
+	args.size = size;
+	args.unused = 0;
+
+	if (copy_to_user(argp, &args, sizeof(args)))
+		return -EFAULT;
+
+	return 1;
+}
+
+/*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
  */
@@ -589,6 +628,9 @@ int do_vfs_ioctl(struct file *filp, unsi
 		return put_user(inode->i_sb->s_blocksize, p);
 	}

+	case FIMETADATA_INCORE:
+		return ioctl_metadata_incore(filp, argp);
+
 	default:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);

Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h	2011-01-18 10:15:17.000000000 +0800
+++ linux/include/linux/fs.h	2011-01-18 10:39:40.000000000 +0800
@@ -53,6 +53,13 @@ struct inodes_stat_t {
 };

+struct metadata_incore_args {
+	__u64 offset;	/* offset in metadata address */
+	__u64 address;	/* returned address of metadata in memory */
+	__u32 size;	/* size of the metadata */
+	__u32 unused;
+};
+
 #define NR_FILE 8192	/* this can well be larger on a larger system */

 #define MAY_EXEC 1
@@ -327,6 +334,7 @@ struct inodes_stat_t {
 #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
 #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
 #define FITRIM		_IOWR('X', 121, struct fstrim_range)	/* Trim */
+#define FIMETADATA_INCORE	_IOWR('X', 122, struct metadata_incore_args)

 #define FS_IOC_GETFLAGS		_IOR('f', 1, long)
 #define FS_IOC_SETFLAGS		_IOW('f', 2, long)
@@ -1626,6 +1634,8 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*metadata_incore)(struct super_block*, loff_t *offset,
+		ssize_t *size);
 };

 /*

Index: linux/fs/compat_ioctl.c
===================================================================
--- linux.orig/fs/compat_ioctl.c	2011-01-18 09:38:03.000000000 +0800
+++ linux/fs/compat_ioctl.c	2011-01-18 10:39:40.000000000 +0800
@@ -883,6 +883,7 @@ COMPATIBLE_IOCTL(FIGETBSZ)
 /* 'X' - originally XFS but some now in the VFS */
 COMPATIBLE_IOCTL(FIFREEZE)
 COMPATIBLE_IOCTL(FITHAW)
+COMPATIBLE_IOCTL(FIMETADATA_INCORE)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
 COMPATIBLE_IOCTL(KDGKBTYPE)
@@ -1578,6 +1579,7 @@ asmlinkage long compat_sys_ioctl(unsigne
 	case FIONBIO:
 	case FIOASYNC:
 	case FIOQSIZE:
+	case FIMETADATA_INCORE:
 		break;

 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
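For reference, here is a minimal userspace sketch of the loop this ioctl
implies. The FIMETADATA_INCORE number and struct layout are taken from the
patch above, but they exist in no released kernel header, so the sketch
re-declares both locally; treat it as an illustration of the proposed
protocol, not a tested tool.

/*
 * Sketch of a userspace consumer for the proposed FIMETADATA_INCORE
 * ioctl.  The request number and struct layout come from the patch
 * above; both are re-declared here because no kernel header ships them.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

struct metadata_incore_args {
	uint64_t offset;	/* in: where to continue scanning */
	uint64_t address;	/* out: start of an in-core metadata range */
	uint32_t size;		/* out: length of the range, in bytes */
	uint32_t unused;
};

#define FIMETADATA_INCORE _IOWR('X', 122, struct metadata_incore_args)

int main(int argc, char **argv)
{
	struct metadata_incore_args args = { .offset = 0 };
	int fd, ret;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	/* 1 == one range copied out, 0 == no more metadata, <0 == error */
	while ((ret = ioctl(fd, FIMETADATA_INCORE, &args)) == 1) {
		printf("in-core metadata: %llu +%u\n",
		       (unsigned long long)args.address, args.size);
		/*
		 * Continue after the returned range.  The kernel rejects
		 * offsets that are not page aligned; the btrfs
		 * implementation reports page-granular ranges, so this
		 * stays aligned.
		 */
		args.offset = args.address + args.size;
	}
	close(fd);
	return ret < 0 ? 1 : 0;
}

The collected ranges would then be replayed early during the next boot via
the filesystem-specific readahead ioctl mentioned later in this thread.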
On Wed, 19 Jan 2011 09:15:18 +0800 Shaohua Li <shaohua.li@intel.com> wrote:

> Subject: add metadata_incore ioctl in vfs
>
> Add an ioctl to the VFS that dumps a filesystem's in-memory metadata.
> Userspace collects this information and uses it to do metadata readahead.
> A filesystem can hook into super_operations.metadata_incore to report its
> metadata in a filesystem-specific way. The next patch gives an example of
> how to implement .metadata_incore in btrfs.
>
> ...
>
>  /*
> + * Copy info about metadata in memory to userspace
> + * Returns:
> + *	= 1, one metadata range copied to userspace
> + *	= 0, no more metadata
> + *	< 0, error
> + */
> +static int ioctl_metadata_incore(struct file *filp, void __user *argp)
> +{
> +	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
> +	struct metadata_incore_args args;
> +	loff_t offset;
> +	ssize_t size;
> +
> +	if (!sb->s_op->metadata_incore)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&args, argp, sizeof(args)))
> +		return -EFAULT;
> +
> +	/* we check metadata info in page units */
> +	if (args.offset & ~PAGE_CACHE_MASK)
> +		return -EINVAL;
> +
> +	offset = args.offset;
> +
> +	if (sb->s_op->metadata_incore(sb, &offset, &size) < 0)
> +		return 0;
> +
> +	args.address = offset;
> +	args.size = size;
> +	args.unused = 0;
> +
> +	if (copy_to_user(argp, &args, sizeof(args)))
> +		return -EFAULT;
> +
> +	return 1;
> +}

So userspace opens any file on the fs and runs this ioctl against it?

That's a pretty awkward interface - we're doing an fs-wide operation
but the fs is identified by a single file which happens to live on that
fs.  For example, this precludes a future extension whereby userspace
can query the incore metadata for a particular file.  The statfs
syscall sucks in the same manner.

I don't know if this is worth addressing.  Perhaps require that the
filp refers to the root of the fs?

Also, is this a privileged operation?  If not, then that might be a
problem - could it be used by unprivileged users to work out which
files have been opened recently or something like that?
On Thu, 2011-01-20 at 04:41 +0800, Andrew Morton wrote:
> On Wed, 19 Jan 2011 09:15:18 +0800
> Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>
> > Subject: add metadata_incore ioctl in vfs
> >
> > Add an ioctl to the VFS that dumps a filesystem's in-memory metadata.
> > Userspace collects this information and uses it to do metadata readahead.
> > A filesystem can hook into super_operations.metadata_incore to report its
> > metadata in a filesystem-specific way. The next patch gives an example of
> > how to implement .metadata_incore in btrfs.
> >
> > ...
> >
> >  /*
> > + * Copy info about metadata in memory to userspace
> > + * Returns:
> > + *	= 1, one metadata range copied to userspace
> > + *	= 0, no more metadata
> > + *	< 0, error
> > + */
> > +static int ioctl_metadata_incore(struct file *filp, void __user *argp)
> > +{
> > +	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
> > +	struct metadata_incore_args args;
> > +	loff_t offset;
> > +	ssize_t size;
> > +
> > +	if (!sb->s_op->metadata_incore)
> > +		return -EINVAL;
> > +
> > +	if (copy_from_user(&args, argp, sizeof(args)))
> > +		return -EFAULT;
> > +
> > +	/* we check metadata info in page units */
> > +	if (args.offset & ~PAGE_CACHE_MASK)
> > +		return -EINVAL;
> > +
> > +	offset = args.offset;
> > +
> > +	if (sb->s_op->metadata_incore(sb, &offset, &size) < 0)
> > +		return 0;
> > +
> > +	args.address = offset;
> > +	args.size = size;
> > +	args.unused = 0;
> > +
> > +	if (copy_to_user(argp, &args, sizeof(args)))
> > +		return -EFAULT;
> > +
> > +	return 1;
> > +}
>
> So userspace opens any file on the fs and runs this ioctl against it?
>
> That's a pretty awkward interface - we're doing an fs-wide operation
> but the fs is identified by a single file which happens to live on that
> fs.  For example, this precludes a future extension whereby userspace
> can query the incore metadata for a particular file.  The statfs
> syscall sucks in the same manner.
>
> I don't know if this is worth addressing.  Perhaps require that the
> filp refers to the root of the fs?

I didn't see why this is needed, but I can limit the filp to the root
of the fs.

> Also, is this a privileged operation?  If not, then that might be a
> problem - could it be used by unprivileged users to work out which
> files have been opened recently or something like that?

It's harmless even if an unprivileged user uses it. I don't think an
unprivileged user can decode the data returned from the ioctl.

Thanks,
Shaohua
On Thu, 20 Jan 2011 10:30:47 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> > I don't know if this is worth addressing.  Perhaps require that the
> > filp refers to the root of the fs?
> I didn't see why this is needed, but I can limit the filp to the root
> of the fs.

I don't think it matters much either.  The only problem I can see is if
we were to later try to extend the ioctl into a per-file thing.

> > Also, is this a privileged operation?  If not, then that might be a
> > problem - could it be used by unprivileged users to work out which
> > files have been opened recently or something like that?
> It's harmless even if an unprivileged user uses it. I don't think an
> unprivileged user can decode the data returned from the ioctl.

um.

Well, by doing a before-and-after thing I can use this ioctl to work
out what metadata blocks are used when someone reads
/my/super/secret-directory/foo.  Then I can write a program which sits
there waiting until someone else reads /my/super/secret-directory/foo.
Then I can use that information to start WWIII or something.  I dunno,
strange things happen.

Unless there's a good *need* to make this available to unprivileged
users then we should not do so.
On Thu, 2011-01-20 at 10:42 +0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 10:30:47 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
>
> > > I don't know if this is worth addressing.  Perhaps require that the
> > > filp refers to the root of the fs?
> > I didn't see why this is needed, but I can limit the filp to the root
> > of the fs.
>
> I don't think it matters much either.  The only problem I can see is if
> we were to later try to extend the ioctl into a per-file thing.

Since we return page ranges, a metadata page might be shared by several
files, which means the per-file thing doesn't work. For a fs using
trees, it's even harder to distinguish a single file's metadata.

> > > Also, is this a privileged operation?  If not, then that might be a
> > > problem - could it be used by unprivileged users to work out which
> > > files have been opened recently or something like that?
> > It's harmless even if an unprivileged user uses it. I don't think an
> > unprivileged user can decode the data returned from the ioctl.
>
> um.
>
> Well, by doing a before-and-after thing I can use this ioctl to work
> out what metadata blocks are used when someone reads
> /my/super/secret-directory/foo.  Then I can write a program which sits
> there waiting until someone else reads /my/super/secret-directory/foo.
> Then I can use that information to start WWIII or something.  I dunno,
> strange things happen.
>
> Unless there's a good *need* to make this available to unprivileged
> users then we should not do so.

OK, looks interesting, I'll update the patch to restrict the ioctl to
privileged users.

Thanks,
Shaohua
On Thu, 20 Jan 2011 10:48:33 +0800 Shaohua Li <shaohua.li@intel.com> wrote:

> On Thu, 2011-01-20 at 10:42 +0800, Andrew Morton wrote:
> > On Thu, 20 Jan 2011 10:30:47 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> >
> > > > I don't know if this is worth addressing.  Perhaps require that the
> > > > filp refers to the root of the fs?
> > > I didn't see why this is needed, but I can limit the filp to the root
> > > of the fs.
> >
> > I don't think it matters much either.  The only problem I can see is if
> > we were to later try to extend the ioctl into a per-file thing.
> Since we return page ranges, a metadata page might be shared by several
> files, which means the per-file thing doesn't work. For a fs using
> trees, it's even harder to distinguish a single file's metadata.

hm, why.  A query for "which blocks need to be read to access this
file" may return blocks which are shared with other files, but it's
still useful info.  Because it will represent vastly less data (and
hence IO) than the current fs-wide thing.

Now I actually look at it, I cannot find any documentation for the
ioctl!

It seems to return a single offset/length tuple which refers to the
btrfs metadata "file", with the intent that this tuple later be fed
into a btrfs-specific readahead ioctl.

I can see how this might be used with say fatfs or ext3 where all
metadata resides within the blockdev address_space.  But how is a
filesystem which keeps its metadata in multiple address_spaces supposed
to use this interface?

So.  Please fully document the proposed userspace APIs!  This should be
the first thing we look at.  Then we can take a look at how applicable
that is to other-than-btrfs filesystems.
On Thu, 2011-01-20 at 11:05 +0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 10:48:33 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>
> > On Thu, 2011-01-20 at 10:42 +0800, Andrew Morton wrote:
> > > On Thu, 20 Jan 2011 10:30:47 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > >
> > > > > I don't know if this is worth addressing.  Perhaps require that the
> > > > > filp refers to the root of the fs?
> > > > I didn't see why this is needed, but I can limit the filp to the root
> > > > of the fs.
> > >
> > > I don't think it matters much either.  The only problem I can see is if
> > > we were to later try to extend the ioctl into a per-file thing.
> > Since we return page ranges, a metadata page might be shared by several
> > files, which means the per-file thing doesn't work. For a fs using
> > trees, it's even harder to distinguish a single file's metadata.
>
> hm, why.  A query for "which blocks need to be read to access this
> file" may return blocks which are shared with other files, but it's
> still useful info.  Because it will represent vastly less data (and
> hence IO) than the current fs-wide thing.

ok

> Now I actually look at it, I cannot find any documentation for the ioctl!

I'll write more documentation in a later post.

> It seems to return a single offset/length tuple which refers to the
> btrfs metadata "file", with the intent that this tuple later be fed
> into a btrfs-specific readahead ioctl.
>
> I can see how this might be used with say fatfs or ext3 where all
> metadata resides within the blockdev address_space.  But how is a
> filesystem which keeps its metadata in multiple address_spaces supposed
> to use this interface?

Oh, this looks like a big problem, thanks for letting me know about
such filesystems. Is it possible for a specific filesystem to map its
multiple address_space ranges into one big virtual range? The new
ioctls would handle the mapping.

If the issue can't be solved, we can only add metadata readahead as a
filesystem-specific implementation like in my initial post, instead of
as a generic interface.

Thanks,
Shaohua
On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> > It seems to return a single offset/length tuple which refers to the
> > btrfs metadata "file", with the intent that this tuple later be fed
> > into a btrfs-specific readahead ioctl.
> >
> > I can see how this might be used with say fatfs or ext3 where all
> > metadata resides within the blockdev address_space.  But how is a
> > filesystem which keeps its metadata in multiple address_spaces supposed
> > to use this interface?
> Oh, this looks like a big problem, thanks for letting me know about
> such filesystems. Is it possible for a specific filesystem to map its
> multiple address_space ranges into one big virtual range? The new
> ioctls would handle the mapping.

I'm not sure what you mean by that.

ext2, minix and probably others create an address_space for each
directory.  Heaven knows what xfs does (for example).

> If the issue can't be solved, we can only add metadata readahead as a
> filesystem-specific implementation like in my initial post, instead of
> as a generic interface.

Well.  One approach would be for the kernel to report the names of all
presently-cached files.  And for each file, report the offsets of all
the pages which are presently in pagecache.  This all gets put into a
database.

At cold-boot time we open all those files and read the relevant pages.

To optimise that further, userspace would need to use fibmap to work
out the LBA(s) of each page, and then read the pages in an optimised
order.

To optimise that even further, userspace would need to find the on-disk
locations of all the metadata for each file, generate the
metadata->data dependencies and then incorporate that into the reading
order.

I actually wrote code to do all this.  Gad, it was ten years ago.  I
forget how it works, but I do recall that it pioneered the technology
of doing (effectively) a sys_write(1, ...) from a kernel module, so the
module's output appears on modprobe's stdout and can be redirected to
another file or a pipe.  So sue me!  It's in
http://userweb.kernel.org/~akpm/stuff/fboot.tar.gz.  Good luck with
that ;)

<looks>

It walked mem_map[], identifying pagecache pages, walking back from the
page* all the way to the filename, then logging the pathname and the
file's pagecache indexes.  It also handled the blockdev superblock,
where all the ext3 metadata resides.

There are much smarter ways of doing this of course, especially with
the vfs data structures which we later added.

<googles>

According to http://kerneltrap.org/node/2157 it sped up cold boot by
"10%", whatever that means.  Seems that I wasn't sufficiently impressed
by that and got distracted.

I'm not sure any of that was very useful, really.  A full-on coldboot
optimiser really wants visibility into every disk block which needs to
be read, and then mechanisms to tell the kernel to load those blocks
into the correct address_spaces.  That's hard, because file data
depends on file metadata.  A vast simplification would be to do it in
two disk passes: read all the metadata on pass 1, then all the data on
pass 2.

A totally different approach is to reorder all the data and metadata
on-disk, so no special cold-boot processing is needed at all.

And a third approach is to save all the cache into a special
file/partition/etc and to preload all that into kernel data structures
at boot.  Obviously this one is risky/tricky because the on-disk
replica of the real data can get out of sync with the real data.
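As an aside, the per-file half of that logging - "which pages of this
file are presently in pagecache" - can be sampled from plain userspace
today with mmap(2) plus mincore(2). A minimal sketch, using only those
standard interfaces (it is not part of any patch in this thread):

/*
 * Print which pages of a file are currently resident in the page
 * cache: the userspace analogue of the per-file pagecache logging
 * described above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;
	long pagesize = sysconf(_SC_PAGESIZE);
	unsigned char *vec;
	size_t pages, i;
	void *map;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	if (fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;
	pages = (st.st_size + pagesize - 1) / pagesize;

	/* mapping alone does not fault the file in, so this is a probe */
	map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	vec = malloc(pages);
	if (map == MAP_FAILED || !vec || mincore(map, st.st_size, vec) < 0)
		return 1;

	for (i = 0; i < pages; i++)
		if (vec[i] & 1)		/* low bit set == page resident */
			printf("%s: page %zu cached\n", argv[1], i);

	munmap(map, st.st_size);
	free(vec);
	close(fd);
	return 0;
}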
On Wed, Jan 19, 2011 at 08:10:14PM -0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
>
> > > It seems to return a single offset/length tuple which refers to the
> > > btrfs metadata "file", with the intent that this tuple later be fed
> > > into a btrfs-specific readahead ioctl.
> > >
> > > I can see how this might be used with say fatfs or ext3 where all
> > > metadata resides within the blockdev address_space.  But how is a
> > > filesystem which keeps its metadata in multiple address_spaces supposed
> > > to use this interface?
> > Oh, this looks like a big problem, thanks for letting me know about
> > such filesystems. Is it possible for a specific filesystem to map its
> > multiple address_space ranges into one big virtual range? The new
> > ioctls would handle the mapping.
>
> I'm not sure what you mean by that.
>
> ext2, minix and probably others create an address_space for each
> directory.  Heaven knows what xfs does (for example).

In 2.6.39 it won't even use address spaces for metadata caching.

Besides, XFS already has pretty sophisticated metadata readahead built
in - it's one of the reasons why the XFS directory code scales so well
on cold cache lookups of large directories - so I don't see much need
for such an interface for XFS.

Perhaps btrfs would be better served by implementing speculative
metadata readahead in the places where it makes sense (e.g. readdir),
because it will improve cold-cache performance on a much wider range of
workloads than just boot-time....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com
On Thu, 2011-01-20 at 12:10 +0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>
> > > It seems to return a single offset/length tuple which refers to the
> > > btrfs metadata "file", with the intent that this tuple later be fed
> > > into a btrfs-specific readahead ioctl.
> > >
> > > I can see how this might be used with say fatfs or ext3 where all
> > > metadata resides within the blockdev address_space.  But how is a
> > > filesystem which keeps its metadata in multiple address_spaces supposed
> > > to use this interface?
> > Oh, this looks like a big problem, thanks for letting me know about
> > such filesystems. Is it possible for a specific filesystem to map its
> > multiple address_space ranges into one big virtual range? The new
> > ioctls would handle the mapping.
>
> I'm not sure what you mean by that.
>
> ext2, minix and probably others create an address_space for each
> directory.  Heaven knows what xfs does (for example).

Yes, this is for one directory, but all the files' metadata is in the
block_dev address_space. I thought you meant that some filesystems have
several address_spaces like the block_dev one, which wouldn't fit well
in my implementation. For ext-like filesystems there is only one such
address_space. For filesystems with several address_spaces, my proposal
is to map them to one big virtual address_space in the new ioctls.

[snip]

> I'm not sure any of that was very useful, really.  A full-on coldboot
> optimiser really wants visibility into every disk block which needs to
> be read, and then mechanisms to tell the kernel to load those blocks
> into the correct address_spaces.  That's hard, because file data
> depends on file metadata.  A vast simplification would be to do it in
> two disk passes: read all the metadata on pass 1, then all the data on
> pass 2.

This is exactly what my patch does. We use the new ioctls to do
metadata readahead in the first pass, and do data readahead in the
second pass (see the sketch after this mail).

> A totally different approach is to reorder all the data and metadata
> on-disk, so no special cold-boot processing is needed at all.

Not feasible for a product, and it's very hard for some filesystems.

> And a third approach is to save all the cache into a special
> file/partition/etc and to preload all that into kernel data structures
> at boot.  Obviously this one is risky/tricky because the on-disk
> replica of the real data can get out of sync with the real data.

Tricky stuff.
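The data half of that second pass needs no new kernel support. A sketch
of what a boot-time replayer's pass 2 could look like follows; the
"boot.files" log and its one-path-per-line format are hypothetical names
invented here, and pass 1 (metadata) would use the ioctls proposed in
this thread.

/*
 * Pass 2 of the two-pass cold-boot scheme: plain data readahead over a
 * recorded file list.  readahead(2) is a standard Linux call; the
 * "boot.files" log is a hypothetical artifact of a previous boot.
 */
#define _GNU_SOURCE		/* readahead(2) */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	char path[4096];
	FILE *list = fopen("boot.files", "r");

	if (!list)
		return 1;
	while (fgets(path, sizeof(path), list)) {
		struct stat st;
		int fd;

		path[strcspn(path, "\n")] = '\0';
		fd = open(path, O_RDONLY);
		if (fd < 0)
			continue;
		/* populate the page cache with the file's data */
		if (fstat(fd, &st) == 0)
			readahead(fd, 0, st.st_size);
		close(fd);
	}
	fclose(list);
	return 0;
}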
On Thu, 2011-01-20 at 12:41 +0800, Dave Chinner wrote:
> On Wed, Jan 19, 2011 at 08:10:14PM -0800, Andrew Morton wrote:
> > On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> >
> > > > It seems to return a single offset/length tuple which refers to the
> > > > btrfs metadata "file", with the intent that this tuple later be fed
> > > > into a btrfs-specific readahead ioctl.
> > > >
> > > > I can see how this might be used with say fatfs or ext3 where all
> > > > metadata resides within the blockdev address_space.  But how is a
> > > > filesystem which keeps its metadata in multiple address_spaces supposed
> > > > to use this interface?
> > > Oh, this looks like a big problem, thanks for letting me know about
> > > such filesystems. Is it possible for a specific filesystem to map its
> > > multiple address_space ranges into one big virtual range? The new
> > > ioctls would handle the mapping.
> >
> > I'm not sure what you mean by that.
> >
> > ext2, minix and probably others create an address_space for each
> > directory.  Heaven knows what xfs does (for example).
>
> In 2.6.39 it won't even use address spaces for metadata caching.
>
> Besides, XFS already has pretty sophisticated metadata readahead built
> in - it's one of the reasons why the XFS directory code scales so well
> on cold cache lookups of large directories - so I don't see much need
> for such an interface for XFS.
>
> Perhaps btrfs would be better served by implementing speculative
> metadata readahead in the places where it makes sense (e.g. readdir),
> because it will improve cold-cache performance on a much wider range of
> workloads than just boot-time....

I don't know about xfs. Sophisticated metadata readahead might make
metadata reads asynchronous, but I don't see how it can remove the disk
seeks. Since metadata and data usually live in different disk block
ranges, doing data readahead will unavoidably read metadata and cause
disk seeks between reading data and metadata.

Thanks,
Shaohua
On Thu, Jan 20, 2011 at 12:10:14PM +0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>
> > > It seems to return a single offset/length tuple which refers to the
> > > btrfs metadata "file", with the intent that this tuple later be fed
> > > into a btrfs-specific readahead ioctl.
> > >
> > > I can see how this might be used with say fatfs or ext3 where all
> > > metadata resides within the blockdev address_space.  But how is a
> > > filesystem which keeps its metadata in multiple address_spaces supposed
> > > to use this interface?
> > Oh, this looks like a big problem, thanks for letting me know about
> > such filesystems. Is it possible for a specific filesystem to map its
> > multiple address_space ranges into one big virtual range? The new
> > ioctls would handle the mapping.
>
> I'm not sure what you mean by that.
>
> ext2, minix and probably others create an address_space for each
> directory.  Heaven knows what xfs does (for example).
>
> > If the issue can't be solved, we can only add metadata readahead as a
> > filesystem-specific implementation like in my initial post, instead of
> > as a generic interface.
>
> Well.  One approach would be for the kernel to report the names of all
> presently-cached files.  And for each file, report the offsets of all
> the pages which are presently in pagecache.  This all gets put into a
> database.
>
> At cold-boot time we open all those files and read the relevant pages.
>
> To optimise that further, userspace would need to use fibmap to work
> out the LBA(s) of each page, and then read the pages in an optimised
> order.
>
> To optimise that even further, userspace would need to find the on-disk
> locations of all the metadata for each file, generate the
> metadata->data dependencies and then incorporate that into the reading
> order.
>
> I actually wrote code to do all this.  Gad, it was ten years ago.  I
> forget how it works, but I do recall that it pioneered the technology
> of doing (effectively) a sys_write(1, ...) from a kernel module, so the
> module's output appears on modprobe's stdout and can be redirected to
> another file or a pipe.  So sue me!  It's in
> http://userweb.kernel.org/~akpm/stuff/fboot.tar.gz.  Good luck with
> that ;)
>
> <looks>
>
> It walked mem_map[], identifying pagecache pages, walking back from the
> page* all the way to the filename, then logging the pathname and the
> file's pagecache indexes.  It also handled the blockdev superblock,
> where all the ext3 metadata resides.

> There are much smarter ways of doing this of course, especially with
> the vfs data structures which we later added.

Yup :)

The attached patch walks sb->s_inodes and dumps an ordered view of all
cached file pages. It will list each cached file and its pages in the
order of struct inode creation time.

The patch records and shows the command name that first opened the
file. (At the time we dump the page cache, the task may no longer
exist.) Although the field is very useful in some cases, it does add
runtime overhead. I'm not sure how to balance this situation. Adding a
compile-time option? But then the trace output becomes dependent on the
kernel configuration, which may confuse user-space tools (at least the
dumb ones). Otherwise the patch is good enough for wider review.

Here is a trimmed example output.

root@bay /home/wfg# echo / > /debug/tracing/objects/mm/pages/dump-fs
root@bay /home/wfg# cat /debug/tracing/trace

The output is made of intermixed lines for inodes and pages.
The corresponding field names are:

file lines: ino size cached age(ms) dirty type first-opened-by file-name
page lines: index len page-flags count mapcount

1507329 4096 8192 309042 ____ DIR swapper /
        0 2 ____RU_____ 1 0
1786836 12288 40960 309026 ____ DIR swapper /sbin
        0 10 ___ARU_____ 1 0
1786946 37312 40960 309024 ____ REG swapper /sbin/init
        0 6 M__ARU_____ 2 1
        6 1 M__A_U_____ 2 1
        7 1 M__ARU_____ 2 1
        8 2 _____U_____ 1 0
1507464 4 4096 309022 ____ LNK swapper /lib64
        0 1 ___ARU_____ 1 0
1590173 12288 0 309021 ____ DIR swapper /lib
4563326 12 4096 309020 ____ LNK swapper /lib/ld-linux-x86-64.so.2
        0 1 ___ARU_____ 1 0
4563295 128744 131072 309019 ____ REG swapper /lib/ld-2.11.2.so
        0 1 M__ARU_____ 21 20
        1 3 M__ARU_____ 17 16
        4 4 M__ARU_____ 20 19
        8 2 M__ARU_____ 27 26
        10 3 M__ARU_____ 20 19
        13 1 M__ARU_____ 27 26
        14 1 M__ARU_____ 26 25
        15 1 M__ARU_____ 20 19
        16 1 M__ARU_____ 18 17
        17 1 M__ARU_____ 9 8
        18 1 M__A_U_____ 4 3
        19 1 M__ARU_____ 27 26
        20 1 M__ARU_____ 17 16
        21 1 M__ARU_____ 20 19
        22 1 M__ARU_____ 27 26
        23 1 M__ARU_____ 20 19
        24 1 M__ARU_____ 26 25
        25 1 _____U_____ 1 0
        26 1 M__A_U_____ 4 3
        27 1 M__ARU_____ 20 19
        28 4 _____U_____ 1 0
1525477 12288 0 309011 ____ DIR init /etc
1526463 64634 65536 309009 ____ REG init /etc/ld.so.cache
        0 1 ___ARU_____ 1 0
        1 1 _____U_____ 1 0
        2 13 ___ARU_____ 1 0
        15 1 ____RU_____ 1 0
1590258 241632 241664 309005 ____ REG init /lib/libsepol.so.1
        0 5 M__ARU_____ 2 1
        5 42 _____U_____ 1 0
        47 1 M__ARU_____ 2 1
        48 11 _____U_____ 1 0
1590330 117848 118784 308989 ____ REG init /lib/libselinux.so.1
        0 1 M__ARU_____ 7 6
        1 4 M__ARU_____ 4 3
        5 1 M__ARU_____ 5 4
        6 5 _____U_____ 1 0
        11 2 M__ARU_____ 4 3
        13 5 _____U_____ 1 0
        18 1 ___ARU_____ 1 0
        19 2 _____U_____ 1 0
        21 1 M__ARU_____ 5 4
        22 7 _____U_____ 1 0
4563314 14 4096 308982 ____ LNK init /lib/libc.so.6
        0 1 ___ARU_____ 1 0
4563283 1432968 1433600 308981 ____ REG init /lib/libc-2.11.2.so
        0 3 M__ARU_____ 27 26
        3 1 M__ARU_____ 25 24
        4 2 M__ARU_____ 23 22
        6 1 M__ARU_____ 26 25
        7 1 M__ARU_____ 22 21
        8 1 M__ARU_____ 27 26
        9 2 M__ARU_____ 25 24
        11 1 M__ARU_____ 23 22
        12 1 M__ARU_____ 25 24
        13 1 M__ARU_____ 24 23
        14 1 M__ARU_____ 25 24
        15 3 M__ARU_____ 24 23
        18 3 M__ARU_____ 26 25
        21 2 M__ARU_____ 27 26
        23 7 M__ARU_____ 17 16
        30 1 M__ARU_____ 29 28
        31 1 M__ARU_____ 25 24
        32 2 M__ARU_____ 4 3
        34 1 M__ARU_____ 3 2
        35 2 M__ARU_____ 4 3
        37 1 M__ARU_____ 2 1
        38 1 _____U_____ 1 0
        39 1 M__ARU_____ 4 3
        40 1 M__ARU_____ 13 12
        41 1 M__ARU_____ 12 11
        42 1 M__ARU_____ 5 4
        43 1 M__ARU_____ 23 22
        44 2 M__ARU_____ 6 5
        46 1 ___ARU_____ 1 0
        47 1 M__ARU_____ 12 11
        48 1 M__ARU_____ 4 3
        49 1 M__ARU_____ 18 17
        50 1 M__ARU_____ 29 28
        51 2 M__ARU_____ 2 1
        53 1 M__ARU_____ 27 26
        54 1 M__ARU_____ 19 18
        55 1 M__ARU_____ 25 24
        56 2 _____U_____ 1 0
        58 2 M__ARU_____ 2 1
        60 2 _____U_____ 1 0
        62 1 M__A_U_____ 2 1
        63 1 _____U_____ 1 0
        64 1 ___ARU_____ 1 0
        65 3 M__ARU_____ 29 28
        68 1 M__ARU_____ 21 20
        69 1 M__ARU_____ 26 25
        70 1 M__ARU_____ 9 8
        71 1 M__ARU_____ 3 2
        72 2 ___ARU_____ 1 0
        74 2 _____U_____ 1 0
        76 1 M__ARU_____ 27 26
        77 2 M__ARU_____ 13 12
        79 1 M__ARU_____ 9 8
        80 1 M__ARU_____ 10 9
        81 1 M__A_U_____ 2 1
        82 1 M___RU_____ 4 3
        83 1 M__ARU_____ 3 2
        84 1 M__ARU_____ 16 15
        85 1 M__ARU_____ 3 2
        86 12 _____U_____ 1 0
        98 1 M__ARU_____ 26 25
        99 1 M__ARU_____ 25 24
        100 2 M__ARU_____ 17 16
        102 1 M__ARU_____ 25 24
        103 1 M__ARU_____ 18 17
        104 1 M__ARU_____ 14 13
        105 3 _____U_____ 1 0
        108 1 M__ARU_____ 12 11
        109 2 M__ARU_____ 26 25
        111 6 M__ARU_____ 30 29
        117 1 M__ARU_____ 29 28
        118 1 M__ARU_____ 30 29
        119 1 M__ARU_____ 19 18
        120 1 M__ARU_____ 22 21
        121 1 M__ARU_____ 3 2
        122 1 M__ARU_____ 28 27
        123 1 M__ARU_____ 30 29
        124 1 M__ARU_____ 11 10
        125 1 M__ARU_____ 26 25
        126 1 M__ARU_____ 22 21
        127 2 M__ARU_____ 29 28
        129 2 M__ARU_____ 5 4
        131 1 M__ARU_____ 10 9
        132 1 M__ARU_____ 25 24
        133 2 M__ARU_____ 17 16
        135 1 M__ARU_____ 3 2
        136 6 _____U_____ 1 0
        142 2 M__ARU_____ 3 2
        144 1 M__ARU_____ 8 7
        145 1 M__ARU_____ 22 21
        146 3 M__ARU_____ 8 7
        149 2 _____U_____ 1 0
        151 3 M__ARU_____ 6 5
        154 2 _____U_____ 1 0
        156 1 M__ARU_____ 8 7
        157 1 M__ARU_____ 10 9
        158 1 M__ARU_____ 9 8
        159 1 M__ARU_____ 8 7
        160 1 M__ARU_____ 28 27
        161 1 M__ARU_____ 30 29
        162 1 M__ARU_____ 14 13
        163 1 M____U_____ 2 1
        164 2 _____U_____ 1 0
        166 2 M__ARU_____ 4 3
        168 1 M__ARU_____ 12 11
        169 1 M__ARU_____ 10 9
        170 1 M__ARU_____ 4 3
        171 3 M__ARU_____ 3 2
        174 6 ___ARU_____ 1 0
        180 1 _____U_____ 1 0
        181 9 ___ARU_____ 1 0
        190 1 M__ARU_____ 4 3
        191 1 ___A_U_____ 1 0
        192 1 _____U_____ 1 0
        193 1 ___A_U_____ 1 0
        194 1 M__ARU_____ 30 29
        195 1 M__ARU_____ 27 26
        196 1 M__ARU_____ 17 16
        197 2 _____U_____ 1 0
        199 1 M__ARU_____ 27 26
        200 1 M__ARU_____ 25 24
        201 1 M__ARU_____ 2 1
        202 1 M__ARU_____ 9 8
        203 1 M__ARU_____ 26 25
        204 1 M__ARU_____ 14 13
        205 1 M__ARU_____ 4 3
        206 1 M__ARU_____ 18 17
        207 1 M__ARU_____ 26 25
        208 1 M__ARU_____ 22 21
        209 1 M__ARU_____ 2 1
        210 1 M__ARU_____ 3 2
        211 2 M____U_____ 2 1
        213 5 _____U_____ 1 0
        218 1 ___A_U_____ 1 0

> <googles>
>
> According to http://kerneltrap.org/node/2157 it sped up cold boot by
> "10%", whatever that means.  Seems that I wasn't sufficiently impressed
> by that and got distracted.
>
> I'm not sure any of that was very useful, really.  A full-on coldboot
> optimiser really wants visibility into every disk block which needs to
> be read, and then mechanisms to tell the kernel to load those blocks
> into the correct address_spaces.  That's hard, because file data
> depends on file metadata.  A vast simplification would be to do it in
> two disk passes: read all the metadata on pass 1, then all the data on
> pass 2.

Yes, that is what this patchset tries to do.

> A totally different approach is to reorder all the data and metadata
> on-disk, so no special cold-boot processing is needed at all.

The boot time speedup mentioned in the changelog won't be possible
without the physical data/metadata reordering. Fortunately btrfs makes
it a trivial task.

> And a third approach is to save all the cache into a special
> file/partition/etc and to preload all that into kernel data structures
> at boot.  Obviously this one is risky/tricky because the on-disk
> replica of the real data can get out of sync with the real data.

Hah! We are thinking much alike :)

It's a very good optimization for LiveCDs and readonly mounted NFS /usr.

For a typical desktop, the solution in my mind is to install some
initscript to run at halt/reboot time, after all other tasks have been
killed and filesystems remounted readonly. At that time it may dump
whatever is in the page cache to the swap partition. At the next boot,
the data/metadata can then be read back _perfectly sequentially_ to
populate the page cache.

For a kexec based reboot, the data can even be passed to the next
kernel directly, avoiding the disk IO entirely.

Thanks,
Fengguang
On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> > ext2, minix and probably others create an address_space for each
> > directory.  Heaven knows what xfs does (for example).
> Yes, this is for one directory, but all the files' metadata is in the
> block_dev address_space. I thought you meant that some filesystems have
> several address_spaces like the block_dev one, which wouldn't fit well
> in my implementation. For ext-like filesystems there is only one such
> address_space. For filesystems with several address_spaces, my proposal
> is to map them to one big virtual address_space in the new ioctls.

ext2 and minixfs (and I think sysv and ufs) have a separate
address_space for each directory.  I don't see how those can be
represented with a single "virtual big address_space" - we also need
identifiers in there so each directory's address_space can be created
and appropriately populated.
> > And a third approach is to save all the cache into a special
> > file/partition/etc and to preload all that into kernel data structures
> > at boot.  Obviously this one is risky/tricky because the on-disk
> > replica of the real data can get out of sync with the real data.
>
> Hah! We are thinking much alike :)
>
> It's a very good optimization for LiveCDs and readonly mounted NFS /usr.
>
> For a typical desktop, the solution in my mind is to install some
> initscript to run at halt/reboot time, after all other tasks have been
> killed and filesystems remounted readonly. At that time it may dump
> whatever is in the page cache to the swap partition. At the next boot,

Not "whatever", to be practical. One obvious optimization is to filter
out large cached files, which can be read _sequentially_ at the next
boot anyway. It's a technique mainly for reducing seeks at the cost of
some extra writes at halt/reboot time.

And I suspect btrfs will obsolete this technique, as it can already
pack data/metadata nicely.

Thanks,
Fengguang

> the data/metadata can then be read back _perfectly sequentially_ to
> populate the page cache.
>
> For a kexec based reboot, the data can even be passed to the next
> kernel directly, avoiding the disk IO entirely.
>
> Thanks,
> Fengguang

> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ mmotm/include/trace/events/mm.h	2010-12-26 20:59:48.000000000 +0800
> @@ -0,0 +1,164 @@
> +#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_MM_H
> +
> +#include <linux/tracepoint.h>
> +#include <linux/page-flags.h>
> +#include <linux/memcontrol.h>
> +#include <linux/pagemap.h>
> +#include <linux/mm.h>
> +#include <linux/kernel-page-flags.h>
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM mm
> +
> +extern struct trace_print_flags pageflag_names[];
> +
> +/**
> + * dump_page_frame - called by the trace page dump trigger
> + * @pfn: page frame number
> + * @page: pointer to the page frame
> + *
> + * This is a helper trace point into the dumping of the page frames.
> + * It will record various information about a page frame.
> + */
> +TRACE_EVENT(dump_page_frame,
> +
> +	TP_PROTO(unsigned long pfn, struct page *page),
> +
> +	TP_ARGS(pfn, page),
> +
> +	TP_STRUCT__entry(
> +		__field( unsigned long,	pfn		)
> +		__field( struct page *,	page		)
> +		__field( u64,		stable_flags	)
> +		__field( unsigned long,	flags		)
> +		__field( unsigned int,	count		)
> +		__field( unsigned int,	mapcount	)
> +		__field( unsigned long,	private		)
> +		__field( unsigned long,	mapping		)
> +		__field( unsigned long,	index		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->pfn		= pfn;
> +		__entry->page		= page;
> +		__entry->stable_flags	= stable_page_flags(page);
> +		__entry->flags		= page->flags;
> +		__entry->count		= atomic_read(&page->_count);
> +		__entry->mapcount	= page_mapcount(page);
> +		__entry->private	= page->private;
> +		__entry->mapping	= (unsigned long)page->mapping;
> +		__entry->index		= page->index;
> +	),
> +
> +	TP_printk("%12lx %16p %8x %8x %16lx %16lx %16lx %s",
> +		__entry->pfn,
> +		__entry->page,
> +		__entry->count,
> +		__entry->mapcount,
> +		__entry->private,
> +		__entry->mapping,
> +		__entry->index,
> +		ftrace_print_flags_seq(p, "|",
> +			__entry->flags & PAGE_FLAGS_MASK,
> +			pageflag_names)
> +	)
> +);
> +
> +TRACE_EVENT(dump_page_cache,
> +
> +	TP_PROTO(struct page *page, unsigned long len),
> +
> +	TP_ARGS(page, len),
> +
> +	TP_STRUCT__entry(
> +		__field( unsigned long,	index		)
> +		__field( unsigned long,	len		)
> +		__field( u64,		flags		)
> +		__field( unsigned int,	count		)
> +		__field( unsigned int,	mapcount	)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->index		= page->index;
> +		__entry->len		= len;
> +		__entry->flags		= stable_page_flags(page);
> +		__entry->count		= atomic_read(&page->_count);
> +		__entry->mapcount	= page_mapcount(page);
> +	),
> +
> +	TP_printk("%12lu %6lu %c%c%c%c%c%c%c%c%c%c%c %4u %4u",
> +		__entry->index,
> +		__entry->len,
> +		__entry->flags & (1ULL << KPF_MMAP)		? 'M' : '_',
> +		__entry->flags & (1ULL << KPF_MLOCKED)		? 'm' : '_',
> +		__entry->flags & (1ULL << KPF_UNEVICTABLE)	? 'u' : '_',
> +		__entry->flags & (1ULL << KPF_ACTIVE)		? 'A' : '_',
> +		__entry->flags & (1ULL << KPF_REFERENCED)	? 'R' : '_',
> +		__entry->flags & (1ULL << KPF_UPTODATE)		? 'U' : '_',
> +		__entry->flags & (1ULL << KPF_DIRTY)		? 'D' : '_',
> +		__entry->flags & (1ULL << KPF_WRITEBACK)	? 'W' : '_',
> +		__entry->flags & (1ULL << KPF_RECLAIM)		? 'I' : '_',
> +		__entry->flags & (1ULL << KPF_MAPPEDTODISK)	? 'd' : '_',
> +		__entry->flags & (1ULL << KPF_PRIVATE)		? 'P' : '_',
> +		__entry->count,
> +		__entry->mapcount)
> +);
> +
> +
> +#define show_inode_type(val) __print_symbolic(val,	\
> +	{ S_IFREG,	"REG"	},			\
> +	{ S_IFDIR,	"DIR"	},			\
> +	{ S_IFLNK,	"LNK"	},			\
> +	{ S_IFBLK,	"BLK"	},			\
> +	{ S_IFCHR,	"CHR"	},			\
> +	{ S_IFIFO,	"FIFO"	},			\
> +	{ S_IFSOCK,	"SOCK"	})
> +
> +TRACE_EVENT(dump_inode_cache,
> +
> +	TP_PROTO(struct inode *inode, char *name, int len),
> +
> +	TP_ARGS(inode, name, len),
> +
> +	TP_STRUCT__entry(
> +		__field( unsigned long,	ino	)
> +		__field( loff_t,	size	)	/* bytes */
> +		__field( loff_t,	cached	)	/* bytes */
> +		__field( unsigned long,	age	)	/* ms */
> +		__field( unsigned long,	state	)
> +		__field( umode_t,	mode	)
> +		__array( char,		comm,	TASK_COMM_LEN)
> +		__dynamic_array(char,	file,	len	)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->ino	= inode->i_ino;
> +		__entry->size	= i_size_read(inode);
> +		__entry->cached	= inode->i_mapping->nrpages;
> +		__entry->cached	<<= PAGE_CACHE_SHIFT;
> +		__entry->age	= (jiffies - inode->dirtied_when) * 1000 / HZ;
> +		__entry->state	= inode->i_state;
> +		__entry->mode	= inode->i_mode;
> +		memcpy(__entry->comm, inode->i_comm, TASK_COMM_LEN);
> +		memcpy(__get_str(file), name, len);
> +	),
> +
> +	TP_printk("%12lu %12llu %12llu %12lu %c%c%c%c %4s %16s %s",
> +		__entry->ino,
> +		__entry->size,
> +		__entry->cached,
> +		__entry->age,
> +		__entry->state & I_DIRTY_PAGES		? 'D' : '_',
> +		__entry->state & I_DIRTY_DATASYNC	? 'd' : '_',
> +		__entry->state & I_DIRTY_SYNC		? 'm' : '_',
> +		__entry->state & I_SYNC		? 'S' : '_',
> +		show_inode_type(__entry->mode & S_IFMT),
> +		__entry->comm,
> +		__get_str(file))
> +);
> +
> +#endif /* _TRACE_MM_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> --- mmotm.orig/kernel/trace/Makefile	2010-12-26 20:58:46.000000000 +0800
> +++ mmotm/kernel/trace/Makefile	2010-12-26 20:59:41.000000000 +0800
> @@ -26,6 +26,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer
>  obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
>
>  obj-$(CONFIG_TRACING) += trace.o
> +obj-$(CONFIG_TRACING) += trace_objects.o
>  obj-$(CONFIG_TRACING) += trace_output.o
>  obj-$(CONFIG_TRACING) += trace_stat.o
>  obj-$(CONFIG_TRACING) += trace_printk.o
> @@ -53,6 +54,7 @@ endif
>  obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
>  obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
>  obj-$(CONFIG_EVENT_TRACING) += power-traces.o
> +obj-$(CONFIG_EVENT_TRACING) += trace_mm.o
>  ifeq ($(CONFIG_TRACING),y)
>  obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
>  endif
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ mmotm/kernel/trace/trace_mm.c	2010-12-26 20:59:41.000000000 +0800
> @@ -0,0 +1,367 @@
> +/*
> + * Trace mm pages
> + *
> + * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> + *
> + * Code based on Matt Mackall's /proc/[kpagecount|kpageflags] code.
> + */
> +#include <linux/module.h>
> +#include <linux/bootmem.h>
> +#include <linux/debugfs.h>
> +#include <linux/uaccess.h>
> +#include <linux/ctype.h>
> +#include <linux/pagevec.h>
> +#include <linux/writeback.h>
> +#include <linux/file.h>
> +#include <linux/slab.h>
> +
> +#include "trace_output.h"
> +
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/mm.h>
> +
> +void trace_mm_page_frames(unsigned long start, unsigned long end,
> +			  void (*trace)(unsigned long pfn, struct page *page))
> +{
> +	unsigned long pfn = start;
> +	struct page *page;
> +
> +	if (start > max_pfn - 1)
> +		return;
> +
> +	if (end > max_pfn)
> +		end = max_pfn;
> +
> +	while (pfn < end) {
> +		page = NULL;
> +		if (pfn_valid(pfn))
> +			page = pfn_to_page(pfn);
> +		pfn++;
> +		if (page)
> +			trace(pfn, page);
> +	}
> +}
> +
> +static void trace_mm_page_frame(unsigned long pfn, struct page *page)
> +{
> +	trace_dump_page_frame(pfn, page);
> +}
> +
> +static ssize_t
> +trace_mm_pfn_range_read(struct file *filp, char __user *ubuf, size_t cnt,
> +			loff_t *ppos)
> +{
> +	return simple_read_from_buffer(ubuf, cnt, ppos, "0\n", 2);
> +}
> +
> +
> +/*
> + * recognized formats:
> + *	"M N"	start=M, end=N
> + *	"M"	start=M, end=M+1
> + *	"M +N"	start=M, end=M+N-1
> + */
> +static ssize_t
> +trace_mm_pfn_range_write(struct file *filp, const char __user *ubuf, size_t cnt,
> +			 loff_t *ppos)
> +{
> +	unsigned long start;
> +	unsigned long end = 0;
> +	char buf[64];
> +	char *ptr;
> +
> +	if (cnt >= sizeof(buf))
> +		return -EINVAL;
> +
> +	if (copy_from_user(&buf, ubuf, cnt))
> +		return -EFAULT;
> +
> +	if (tracing_update_buffers() < 0)
> +		return -ENOMEM;
> +
> +	if (trace_set_clr_event("mm", "dump_page_frame", 1))
> +		return -EINVAL;
> +
> +	buf[cnt] = 0;
> +
> +	start = simple_strtoul(buf, &ptr, 0);
> +
> +	for (; *ptr; ptr++) {
> +		if (isdigit(*ptr)) {
> +			if (*(ptr - 1) == '+')
> +				end = start;
> +			end += simple_strtoul(ptr, NULL, 0);
> +			break;
> +		}
> +	}
> +	if (!*ptr)
> +		end = start + 1;
> +
> +	trace_mm_page_frames(start, end, trace_mm_page_frame);
> +
> +	return cnt;
> +}
> +
> +static const struct file_operations trace_mm_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= trace_mm_pfn_range_read,
> +	.write		= trace_mm_pfn_range_write,
> +};
> +
> +static struct dentry *trace_objects_mm_dir(void)
> +{
> +	static struct dentry *d_mm;
> +	struct dentry *d_objects;
> +
> +	if (d_mm)
> +		return d_mm;
> +
> +	d_objects = trace_objects_dir();
> +	if (!d_objects)
> +		return NULL;
> +
> +	d_mm = debugfs_create_dir("mm", d_objects);
> +	if (!d_mm)
> +		pr_warning("Could not create 'objects/mm' directory\n");
> +
> +	return d_mm;
> +}
> +
> +static unsigned long page_flags(struct page *page)
> +{
> +	return page->flags & ((1 << NR_PAGEFLAGS) - 1);
> +}
> +
> +static int pages_similar(struct page *page0, struct page *page)
> +{
> +	if (page_flags(page0) != page_flags(page))
> +		return 0;
> +
> +	if (page_count(page0) != page_count(page))
> +		return 0;
> +
> +	if (page_mapcount(page0) != page_mapcount(page))
> +		return 0;
> +
> +	return 1;
> +}
> +
> +static void dump_pagecache(struct address_space *mapping)
> +{
> +	unsigned long nr_pages;
> +	struct page *pages[PAGEVEC_SIZE];
> +	struct page *uninitialized_var(page0);
> +	struct page *page;
> +	unsigned long start = 0;
> +	unsigned long len = 0;
> +	int i;
> +
> +	for (;;) {
> +		rcu_read_lock();
> +		nr_pages = radix_tree_gang_lookup(&mapping->page_tree,
> +				(void **)pages, start + len, PAGEVEC_SIZE);
> +		rcu_read_unlock();
> +
> +		if (nr_pages == 0) {
> +			if (len)
> +				trace_dump_page_cache(page0, len);
> +			return;
> +		}
> +
> +		for (i = 0; i < nr_pages; i++) {
> +			page = pages[i];
> +
> +			if (len &&
> +			    page->index == start + len &&
> +			    pages_similar(page0, page))
> +				len++;
> +			else {
> +				if (len)
> +					trace_dump_page_cache(page0, len);
> +				page0 = page;
> +				start = page->index;
> +				len = 1;
> +			}
> +		}
> +		cond_resched();
> +	}
> +}
> +
> +static void dump_inode_cache(struct inode *inode,
> +			     char *name_buf,
> +			     struct vfsmount *mnt)
> +{
> +	struct path path = {
> +		.mnt = mnt,
> +		.dentry = d_find_alias(inode)
> +	};
> +	char *name;
> +	int len;
> +
> +	if (!mnt) {
> +		trace_dump_inode_cache(inode, name_buf, strlen(name_buf));
> +		return;
> +	}
> +
> +	if (!path.dentry) {
> +		trace_dump_inode_cache(inode, "", 1);
> +		return;
> +	}
> +
> +	name = d_path(&path, name_buf, PAGE_SIZE);
> +	if (IS_ERR(name)) {
> +		name = "";
> +		len = 1;
> +	} else
> +		len = PAGE_SIZE + name_buf - name;
> +
> +	trace_dump_inode_cache(inode, name, len);
> +
> +	if (path.dentry)
> +		dput(path.dentry);
> +}
> +
> +static void dump_fs_pagecache(struct super_block *sb, struct vfsmount *mnt)
> +{
> +	struct inode *inode;
> +	struct inode *prev_inode = NULL;
> +	char *name_buf;
> +
> +	name_buf = (char *)__get_free_page(GFP_TEMPORARY);
> +	if (!name_buf)
> +		return;
> +
> +	down_read(&sb->s_umount);
> +	if (!sb->s_root)
> +		goto out;
> +
> +	spin_lock(&inode_lock);
> +	list_for_each_entry_reverse(inode, &sb->s_inodes, i_sb_list) {
> +		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
> +			continue;
> +		__iget(inode);
> +		spin_unlock(&inode_lock);
> +		dump_inode_cache(inode, name_buf, mnt);
> +		if (inode->i_mapping->nrpages)
> +			dump_pagecache(inode->i_mapping);
> +		iput(prev_inode);
> +		prev_inode = inode;
> +		cond_resched();
> +		spin_lock(&inode_lock);
> +	}
> +	spin_unlock(&inode_lock);
> +	iput(prev_inode);
> +out:
> +	up_read(&sb->s_umount);
> +	free_page((unsigned long)name_buf);
> +}
> +
> +static ssize_t
> +trace_pagecache_write(struct file *filp, const char __user *ubuf, size_t count,
> +		      loff_t *ppos)
> +{
> +	struct file *file = NULL;
> +	char *name;
> +	int err = 0;
> +
> +	if (count <= 1)
> +		return -EINVAL;
> +	if (count >= PAGE_SIZE)
> +		return -ENAMETOOLONG;
> +
> +	name = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!name)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(name, ubuf, count)) {
> +		err = -EFAULT;
> +		goto out;
> +	}
> +
> +	/* strip the newline added by `echo` */
> +	if (name[count-1] == '\n')
> +		name[count-1] = '\0';
> +	else
> +		name[count] = '\0';
> +
> +	file = filp_open(name, O_RDONLY|O_LARGEFILE, 0);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		file = NULL;
> +		goto out;
> +	}
> +
> +	if (tracing_update_buffers() < 0) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +	if (trace_set_clr_event("mm", "dump_page_cache", 1)) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +	if (trace_set_clr_event("mm", "dump_inode_cache", 1)) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (filp->f_path.dentry->d_inode->i_private) {
> +		dump_fs_pagecache(file->f_path.dentry->d_sb, file->f_path.mnt);
> +	} else {
> +		dump_inode_cache(file->f_mapping->host, name, NULL);
> +		dump_pagecache(file->f_mapping);
> +	}
> +
> +out:
> +	if (file)
> +		fput(file);
> +	kfree(name);
> +
> +	return err ? err : count;
> +}
> +
> +static const struct file_operations trace_pagecache_fops = {
> +	.open		= tracing_open_generic,
> +	.read		= trace_mm_pfn_range_read,
> +	.write		= trace_pagecache_write,
> +};
> +
> +static struct dentry *trace_objects_mm_pages_dir(void)
> +{
> +	static struct dentry *d_pages;
> +	struct dentry *d_mm;
> +
> +	if (d_pages)
> +		return d_pages;
> +
> +	d_mm = trace_objects_mm_dir();
> +	if (!d_mm)
> +		return NULL;
> +
> +	d_pages = debugfs_create_dir("pages", d_mm);
> +	if (!d_pages)
> +		pr_warning("Could not create debugfs "
> +			   "'objects/mm/pages' directory\n");
> +
> +	return d_pages;
> +}
> +
> +static __init int trace_objects_mm_init(void)
> +{
> +	struct dentry *d_pages;
> +
> +	d_pages = trace_objects_mm_pages_dir();
> +	if (!d_pages)
> +		return 0;
> +
> +	trace_create_file("dump-pfn", 0600, d_pages, NULL,
> +			  &trace_mm_fops);
> +
> +	trace_create_file("dump-file", 0600, d_pages, NULL,
> +			  &trace_pagecache_fops);
> +
> +	trace_create_file("dump-fs", 0600, d_pages, (void *)1,
> +			  &trace_pagecache_fops);
> +
> +	return 0;
> +}
> +fs_initcall(trace_objects_mm_init);
> --- mmotm.orig/kernel/trace/trace.h	2010-12-26 20:58:46.000000000 +0800
> +++ mmotm/kernel/trace/trace.h	2010-12-26 20:59:41.000000000 +0800
> @@ -295,6 +295,7 @@ struct dentry *trace_create_file(const c
>  				 const struct file_operations *fops);
>
>  struct dentry *tracing_init_dentry(void);
> +struct dentry *trace_objects_dir(void);
>
>  struct ring_buffer_event;
>
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ mmotm/kernel/trace/trace_objects.c	2010-12-26 20:59:41.000000000 +0800
> @@ -0,0 +1,26 @@
> +#include <linux/debugfs.h>
> +
> +#include "trace.h"
> +#include "trace_output.h"
> +
> +struct dentry *trace_objects_dir(void)
> +{
> +	static struct dentry *d_objects;
> +	struct dentry *d_tracer;
> +
> +	if (d_objects)
> +		return d_objects;
> +
> +	d_tracer = tracing_init_dentry();
> +	if (!d_tracer)
> +		return NULL;
> +
> +	d_objects = debugfs_create_dir("objects", d_tracer);
> +	if (!d_objects)
> +		pr_warning("Could not create debugfs "
> +			   "'objects' directory\n");
> +
> +	return d_objects;
> +}
> +
> +
> --- mmotm.orig/mm/page_alloc.c	2010-12-26 20:58:46.000000000 +0800
> +++ mmotm/mm/page_alloc.c	2010-12-26 20:59:41.000000000 +0800
> @@ -5493,7 +5493,7 @@ bool is_free_buddy_page(struct page *pag
>  }
>  #endif
>
> -static struct trace_print_flags pageflag_names[] = {
> +struct trace_print_flags pageflag_names[] = {
>  	{1UL << PG_locked,	"locked"	},
>  	{1UL << PG_error,	"error"		},
>  	{1UL << PG_referenced,	"referenced"	},
> @@ -5541,7 +5541,7 @@ static void dump_page_flags(unsigned lon
>  	printk(KERN_ALERT "page flags: %#lx(", flags);
>
>  	/* remove zone id */
> -	flags &= (1UL << NR_PAGEFLAGS) - 1;
> +	flags &= PAGE_FLAGS_MASK;
>
>  	for (i = 0; pageflag_names[i].name && flags; i++) {
>
> --- mmotm.orig/include/linux/page-flags.h	2010-12-26 20:58:46.000000000 +0800
> +++ mmotm/include/linux/page-flags.h	2010-12-26 20:59:41.000000000 +0800
> @@ -414,6 +414,7 @@ static inline void __ClearPageTail(struc
>   * there has been a kernel bug or struct page corruption.
>   */
>  #define PAGE_FLAGS_CHECK_AT_PREP	((1 << NR_PAGEFLAGS) - 1)
> +#define PAGE_FLAGS_MASK			((1 << NR_PAGEFLAGS) - 1)
>
>  #define PAGE_FLAGS_PRIVATE				\
>  	(1 << PG_private | 1 << PG_private_2)
> --- mmotm.orig/fs/inode.c	2010-12-26 20:58:45.000000000 +0800
> +++ mmotm/fs/inode.c	2010-12-26 21:00:09.000000000 +0800
> @@ -182,7 +182,13 @@ int inode_init_always(struct super_block
>  	inode->i_bdev = NULL;
>  	inode->i_cdev = NULL;
>  	inode->i_rdev = 0;
> -	inode->dirtied_when = 0;
> +
> +	/*
> +	 * This records inode load time. It will be invalidated once inode is
> +	 * dirtied, or jiffies wraps around. Despite the pitfalls it still
> +	 * provides useful information for some use cases like fastboot.
> +	 */
> +	inode->dirtied_when = jiffies;
>
>  	if (security_inode_alloc(inode))
>  		goto out;
> @@ -226,6 +232,9 @@ int inode_init_always(struct super_block
>
>  	percpu_counter_inc(&nr_inodes);
>
> +	BUILD_BUG_ON(sizeof(inode->i_comm) != TASK_COMM_LEN);
> +	memcpy(inode->i_comm, current->comm, TASK_COMM_LEN);
> +
>  	return 0;
>  out:
>  	return -ENOMEM;
> --- mmotm.orig/include/linux/fs.h	2010-12-26 20:59:50.000000000 +0800
> +++ mmotm/include/linux/fs.h	2010-12-26 21:00:09.000000000 +0800
> @@ -800,6 +800,8 @@ struct inode {
>  	struct posix_acl	*i_default_acl;
>  #endif
>  	void			*i_private; /* fs or device private pointer */
> +
> +	char			i_comm[16]; /* first opened by */
>  };
>
>  static inline int inode_unhashed(struct inode *inode)
On Thu, Jan 20, 2011 at 01:44:57PM +0800, Li, Shaohua wrote:
> On Thu, 2011-01-20 at 12:41 +0800, Dave Chinner wrote:
> > On Wed, Jan 19, 2011 at 08:10:14PM -0800, Andrew Morton wrote:
> > > On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > >
> > > > > It seems to return a single offset/length tuple which refers to the
> > > > > btrfs metadata "file", with the intent that this tuple later be fed
> > > > > into a btrfs-specific readahead ioctl.
> > > > >
> > > > > I can see how this might be used with say fatfs or ext3 where all
> > > > > metadata resides within the blockdev address_space.  But how is a
> > > > > filesystem which keeps its metadata in multiple address_spaces supposed
> > > > > to use this interface?
> > > > Oh, this looks like a big problem, thanks for letting me know about
> > > > such filesystems. Is it possible for a specific filesystem to map its
> > > > multiple address_space ranges into one big virtual range? The new
> > > > ioctls would handle the mapping.
> > >
> > > I'm not sure what you mean by that.
> > >
> > > ext2, minix and probably others create an address_space for each
> > > directory.  Heaven knows what xfs does (for example).
> >
> > In 2.6.39 it won't even use address spaces for metadata caching.
> >
> > Besides, XFS already has pretty sophisticated metadata readahead built
> > in - it's one of the reasons why the XFS directory code scales so well
> > on cold cache lookups of large directories - so I don't see much need
> > for such an interface for XFS.
> >
> > Perhaps btrfs would be better served by implementing speculative
> > metadata readahead in the places where it makes sense (e.g. readdir),
> > because it will improve cold-cache performance on a much wider range of
> > workloads than just boot-time....
> I don't know about xfs. Sophisticated metadata readahead might make
> metadata reads asynchronous, but I don't see how it can remove the disk
> seeks. Since metadata and data usually live in different disk block
> ranges, doing data readahead will unavoidably read metadata and cause
> disk seeks between reading data and metadata.

It's standard practice to do in-kernel heuristic readahead for large
directories. It's irrelevant to data/metadata interleaving.

It's exactly interleaved reads that make readahead a must-have. Think
about reading 2+ large files interleaved :)

Thanks,
Fengguang
On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > > ext2, minix and probably others create an address_space for each
> > > directory. Heaven knows what xfs does (for example).
> > Yes, that is for one directory, but all the files' metadata is in
> > the block_dev address_space.
> > I thought you meant some filesystems have several address_spaces
> > like the block_dev address_space, which wouldn't fit well in my
> > implementation. For ext-like filesystems there is only one
> > address_space; for filesystems with several address_spaces, my
> > proposal is to map them to one big virtual address_space in the new
> > ioctls.
>
> ext2 and minixfs (and I think sysv and ufs) have a separate
> address_space for each directory. I don't see how those can be
> represented with a single "virtual big address_space" - we also need
> identifiers in there so each directory's address_space can be created
> and appropriately populated.

Oh, I misunderstood your comment. You are right: the ioctl method
doesn't work for ext2, and the directories' address_spaces can't be
read ahead either. It looks like we can only do metadata readahead in
a filesystem-specific way.

Thanks,
Shaohua
On Thu, Jan 20, 2011 at 02:12:33PM +0800, Li, Shaohua wrote:
> On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> > On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> > > > ext2, minix and probably others create an address_space for each
> > > > directory. Heaven knows what xfs does (for example).
> > > Yes, that is for one directory, but all the files' metadata is in
> > > the block_dev address_space.
> > > I thought you meant some filesystems have several address_spaces
> > > like the block_dev address_space, which wouldn't fit well in my
> > > implementation. For ext-like filesystems there is only one
> > > address_space; for filesystems with several address_spaces, my
> > > proposal is to map them to one big virtual address_space in the
> > > new ioctls.
> >
> > ext2 and minixfs (and I think sysv and ufs) have a separate
> > address_space for each directory. I don't see how those can be
> > represented with a single "virtual big address_space" - we also need
> > identifiers in there so each directory's address_space can be
> > created and appropriately populated.
> Oh, I misunderstood your comment. You are right: the ioctl method
> doesn't work for ext2, and the directories' address_spaces can't be
> read ahead either. It looks like we can only do metadata readahead in
> a filesystem-specific way.

There should be little interest in ext2 boot time optimization.

However, if necessary, we might somehow treat ext2 dirs as files and do
a normal fadvise on them? The other ext2 metadata would still be
accessible via the block_dev interface.

Thanks,
Fengguang
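A rough sketch of what "treat ext2 dirs as files and fadvise them"
could look like from userspace. prefetch_dir() is an invented helper,
and whether POSIX_FADV_WILLNEED is actually honoured on a directory fd
is filesystem-dependent, so treat this as speculative:

#include <fcntl.h>
#include <unistd.h>

static int prefetch_dir(const char *path)
{
	int fd = open(path, O_RDONLY | O_DIRECTORY);
	int ret;

	if (fd < 0)
		return -1;
	/* Hint that all of the directory's pages will be needed soon. */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	close(fd);
	return ret;
}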
On Thu, 20 Jan 2011 14:12:33 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> > On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> > > > ext2, minix and probably others create an address_space for each
> > > > directory. Heaven knows what xfs does (for example).
> > > Yes, that is for one directory, but all the files' metadata is in
> > > the block_dev address_space.
> > > I thought you meant some filesystems have several address_spaces
> > > like the block_dev address_space, which wouldn't fit well in my
> > > implementation. For ext-like filesystems there is only one
> > > address_space; for filesystems with several address_spaces, my
> > > proposal is to map them to one big virtual address_space in the
> > > new ioctls.
> >
> > ext2 and minixfs (and I think sysv and ufs) have a separate
> > address_space for each directory. I don't see how those can be
> > represented with a single "virtual big address_space" - we also need
> > identifiers in there so each directory's address_space can be
> > created and appropriately populated.
> Oh, I misunderstood your comment. You are right: the ioctl method
> doesn't work for ext2, and the directories' address_spaces can't be
> read ahead either. It looks like we can only do metadata readahead in
> a filesystem-specific way.

Another way of doing all this would be to implement some sort of
lookaside cache at the vfs->block boundary. At boot time, load that
cache up with all the disk blocks which we know the boot will need (a
single ascending pass across the disk), and then when the vfs/fs goes
to read a disk block, take a peek in that cache first; if it's a hit,
either steal the page or memcpy it.

It has the obvious coherence problems, which would be pretty simple to
solve by hooking into the block write path as well. The list of needed
blocks can be very simply generated with the existing blktrace
infrastructure. It does add permanent runtime overhead - once the
cache is invalidated and disabled, every IO operation would incur a
test-n-not-taken-branch. Maybe not too bad.

Need to handle small-memory systems somehow, where the cache simply
ooms the machine or becomes ineffective because it's causing eviction
elsewhere.

It could perhaps all be implemented as an md or dm driver.

Or even as an IO scheduler. I say this because IO schedulers can be
replaced on-the-fly, so the caching layer can be unloaded from the
stack once it is finished with.
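As a sketch of the lookaside idea, under invented boot_cache_* names: a
sorted table of preloaded blocks consulted on the read path, with
coherence (invalidation on write) and small-memory handling only
stubbed. It is meant to illustrate the cheap test on every IO, not to
be a real implementation:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct boot_cache_entry {
	unsigned long long sector;	/* start sector on disk */
	void *data;			/* preloaded block contents */
};

static struct boot_cache_entry *boot_cache;	/* sorted by sector */
static size_t boot_cache_nr;
static bool boot_cache_enabled;

/* Read path: if the block was preloaded, memcpy it and skip the disk. */
static bool boot_cache_lookup(unsigned long long sector, void *buf,
			      size_t len)
{
	size_t lo = 0, hi = boot_cache_nr;

	/* The permanent runtime overhead: one test per IO. */
	if (!boot_cache_enabled)
		return false;

	/* Binary search the sorted table. */
	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (boot_cache[mid].sector == sector) {
			memcpy(buf, boot_cache[mid].data, len);
			return true;
		}
		if (boot_cache[mid].sector < sector)
			lo = mid + 1;
		else
			hi = mid;
	}
	return false;
}

/*
 * The write path hook would drop any overlapping entry to stay
 * coherent; once the table is empty, boot_cache_enabled is cleared and
 * only the not-taken branch above remains.
 */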
On Thu, 20 Jan 2011 14:19:50 +0800 Wu Fengguang <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> On Thu, Jan 20, 2011 at 02:12:33PM +0800, Li, Shaohua wrote:
> > On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> > > On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > > > > ext2, minix and probably others create an address_space for
> > > > > each directory. Heaven knows what xfs does (for example).
> > > > Yes, that is for one directory, but all the files' metadata is
> > > > in the block_dev address_space.
> > > > I thought you meant some filesystems have several
> > > > address_spaces like the block_dev address_space, which wouldn't
> > > > fit well in my implementation. For ext-like filesystems there
> > > > is only one address_space; for filesystems with several
> > > > address_spaces, my proposal is to map them to one big virtual
> > > > address_space in the new ioctls.
> > >
> > > ext2 and minixfs (and I think sysv and ufs) have a separate
> > > address_space for each directory. I don't see how those can be
> > > represented with a single "virtual big address_space" - we also
> > > need identifiers in there so each directory's address_space can
> > > be created and appropriately populated.
> > Oh, I misunderstood your comment. You are right: the ioctl method
> > doesn't work for ext2, and the directories' address_spaces can't be
> > read ahead either. It looks like we can only do metadata readahead
> > in a filesystem-specific way.
>
> There should be little interest in ext2 boot time optimization.

We're discussing the userspace interface design here. If that design
is unsuitable for ext2, minixfs, sysvfs and udf then it's not the
right design!
On Thu, 2011-01-20 at 14:19 +0800, Wu, Fengguang wrote:
> On Thu, Jan 20, 2011 at 02:12:33PM +0800, Li, Shaohua wrote:
> > On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> > > On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li@intel.com> wrote:
> > > > > ext2, minix and probably others create an address_space for
> > > > > each directory. Heaven knows what xfs does (for example).
> > > > Yes, that is for one directory, but all the files' metadata is
> > > > in the block_dev address_space.
> > > > I thought you meant some filesystems have several
> > > > address_spaces like the block_dev address_space, which wouldn't
> > > > fit well in my implementation. For ext-like filesystems there
> > > > is only one address_space; for filesystems with several
> > > > address_spaces, my proposal is to map them to one big virtual
> > > > address_space in the new ioctls.
> > >
> > > ext2 and minixfs (and I think sysv and ufs) have a separate
> > > address_space for each directory. I don't see how those can be
> > > represented with a single "virtual big address_space" - we also
> > > need identifiers in there so each directory's address_space can
> > > be created and appropriately populated.
> > Oh, I misunderstood your comment. You are right: the ioctl method
> > doesn't work for ext2, and the directories' address_spaces can't be
> > read ahead either. It looks like we can only do metadata readahead
> > in a filesystem-specific way.
>
> There should be little interest in ext2 boot time optimization.
>
> However, if necessary, we might somehow treat ext2 dirs as files and
> do a normal fadvise on them? The other ext2 metadata would still be
> accessible via the block_dev interface.

The current readahead syscall might already work here; however, I would
expect stalls to happen easily when reading dirs. Think of 3 dirs:
/
/aa
/aa/bb
Before / is in memory, reading /aa will stall, and we can't reduce the
disk seeks here. Metadata readahead would still have some benefit, but
maybe not much on such a filesystem.

Thanks,
Shaohua
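Building on the hypothetical prefetch_dir() sketch earlier in the
thread, the stall is easy to see: even issuing the hints in depth
order, each open() has to resolve its parent directory's blocks first,
so the chain stays inherently serial:

static void prefetch_tree(void)
{
	static const char *dirs[] = { "/", "/aa", "/aa/bb" };
	size_t i;

	for (i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++)
		prefetch_dir(dirs[i]);	/* open() may block on the parent */
}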
On Thu, Jan 20, 2011 at 02:37:37PM +0800, Li, Shaohua wrote:
> On Thu, 2011-01-20 at 14:19 +0800, Wu, Fengguang wrote:
> > On Thu, Jan 20, 2011 at 02:12:33PM +0800, Li, Shaohua wrote:
> > > On Thu, 2011-01-20 at 13:55 +0800, Andrew Morton wrote:
> > > > On Thu, 20 Jan 2011 13:38:18 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > > > > > ext2, minix and probably others create an address_space for
> > > > > > each directory. Heaven knows what xfs does (for example).
> > > > > Yes, that is for one directory, but all the files' metadata
> > > > > is in the block_dev address_space.
> > > > > I thought you meant some filesystems have several
> > > > > address_spaces like the block_dev address_space, which
> > > > > wouldn't fit well in my implementation. For ext-like
> > > > > filesystems there is only one address_space; for filesystems
> > > > > with several address_spaces, my proposal is to map them to
> > > > > one big virtual address_space in the new ioctls.
> > > >
> > > > ext2 and minixfs (and I think sysv and ufs) have a separate
> > > > address_space for each directory. I don't see how those can be
> > > > represented with a single "virtual big address_space" - we also
> > > > need identifiers in there so each directory's address_space can
> > > > be created and appropriately populated.
> > > Oh, I misunderstood your comment. You are right: the ioctl method
> > > doesn't work for ext2, and the directories' address_spaces can't
> > > be read ahead either. It looks like we can only do metadata
> > > readahead in a filesystem-specific way.
> >
> > There should be little interest in ext2 boot time optimization.
> >
> > However, if necessary, we might somehow treat ext2 dirs as files
> > and do a normal fadvise on them? The other ext2 metadata would
> > still be accessible via the block_dev interface.
> The current readahead syscall might already work here; however, I
> would expect stalls to happen easily when reading dirs. Think of 3
> dirs: /, /aa, /aa/bb. Before / is in memory, reading /aa will stall,
> and we can't reduce the disk seeks here. Metadata readahead would
> still have some benefit, but maybe not much on such a filesystem.

Yeah, good point.

Thanks,
Fengguang
On Thu, Jan 20, 2011 at 01:44:57PM +0800, Shaohua Li wrote:
> On Thu, 2011-01-20 at 12:41 +0800, Dave Chinner wrote:
> > On Wed, Jan 19, 2011 at 08:10:14PM -0800, Andrew Morton wrote:
> > > On Thu, 20 Jan 2011 11:21:49 +0800 Shaohua Li <shaohua.li-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> > > > > It seems to return a single offset/length tuple which refers to the
> > > > > btrfs metadata "file", with the intent that this tuple later be fed
> > > > > into a btrfs-specific readahead ioctl.
> > > > >
> > > > > I can see how this might be used with say fatfs or ext3 where all
> > > > > metadata resides within the blockdev address_space. But how is a
> > > > > filesystem which keeps its metadata in multiple address_spaces
> > > > > supposed to use this interface?
> > > > Oh, this looks like a big problem, thanks for letting me know about
> > > > such filesystems. Is it possible for a specific filesystem to map
> > > > multiple address_space ranges to one big virtual range? The new
> > > > ioctls would handle the mapping.
> > >
> > > I'm not sure what you mean by that.
> > >
> > > ext2, minix and probably others create an address_space for each
> > > directory. Heaven knows what xfs does (for example).
> >
> > In 2.6.39 it won't even use address spaces for metadata caching.
> >
> > Besides, XFS already has pretty sophisticated metadata readahead
> > built in - it's one of the reasons why the XFS directory code scales
> > so well on cold cache lookups of large directories - so I don't see
> > much need for such an interface for XFS.
> >
> > Perhaps btrfs would be better served by implementing speculative
> > metadata readahead in the places where it makes sense (e.g. readdir)
> > because it will improve cold-cache performance on a much wider range
> > of workloads than at just boot-time....
> I don't know about xfs. Sophisticated metadata readahead might make
> metadata reads asynchronous, but I thought it was impossible for it to
> remove the disk seeks.

Nothing you do will remove the disk seek. What readahead is supposed to
do is _minimise the latency_ of the disk seek.

> Since metadata and data usually live in different disk block ranges,
> doing data readahead will unavoidably read metadata and cause disk
> seeks between reading data and metadata.

Which comes back to how well the filesystem lays out the metadata
related to the data that needs to be read. In the case of XFS, the
metadata it needs is already in the inode, so once the inodes are read
into memory, there are no extra metadata seeks between data seeks.

That is, if you are using XFS, all you need to do in terms of metadata
readahead is stat every file needed by the boot process. The optimal
order for doing this is simply ordering them by ascending inode number.
IOWs, the problem can be optimised without any special kernel
interfaces to do metadata readahead, especially if you multithread the
stat() walk to avoid blocking on IO that metadata readahead hasn't
already brought into cache....

IIRC, btrfs tends to keep all its per-inode metadata close together
like XFS does, so it should be read at the same time the inode is read.

Indeed, the dependencies of readahead are pretty well understood.
Optimised reading of file data across a complex directory hierarchy is
demonstrated well by this little tool from Chris Mason:

http://oss.oracle.com/~mason/acp/

I suspect that applying such a technique to the problem of optimising
the boot-time IO pattern will net you the same gains as this new kernel
API.
And it will do it in a manner that is filesystem agnostic...

Cheers,

Dave.
-- 
Dave Chinner
david-FqsqvQoI3Ljby3iVrkZq2A@public.gmane.org
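For what it's worth, the stat-in-inode-order technique Dave describes
can be sketched in a few lines of userspace C. Names here are invented,
recursion into subdirectories and error handling are omitted, and acp
does the real, multithreaded version:

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

struct ent {
	ino_t ino;
	char name[256];
};

static int by_ino(const void *a, const void *b)
{
	const struct ent *x = a, *y = b;

	return (x->ino > y->ino) - (x->ino < y->ino);
}

static void warm_inodes(const char *dirpath)
{
	DIR *d = opendir(dirpath);
	struct dirent *de;
	struct ent *ents = NULL;
	size_t i, nr = 0;
	struct stat st;
	char path[4096];

	if (!d)
		return;
	/* Collect the directory entries and their inode numbers. */
	while ((de = readdir(d)) != NULL) {
		ents = realloc(ents, (nr + 1) * sizeof(*ents));
		ents[nr].ino = de->d_ino;
		snprintf(ents[nr].name, sizeof(ents[nr].name), "%s",
			 de->d_name);
		nr++;
	}
	closedir(d);

	/* Ascending inode number approximates ascending disk order. */
	qsort(ents, nr, sizeof(*ents), by_ino);

	for (i = 0; i < nr; i++) {
		snprintf(path, sizeof(path), "%s/%s", dirpath,
			 ents[i].name);
		stat(path, &st);	/* pulls the inode into cache */
	}
	free(ents);
}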
On 01/20/2011 08:27 AM, Andrew Morton wrote:
> Another way of doing all this would be to implement some sort of
> lookaside cache at the vfs->block boundary. At boot time, load that
> cache up with all the disk blocks which we know the boot will need (a
> single ascending pass across the disk), and then when the vfs/fs goes
> to read a disk block, take a peek in that cache first; if it's a hit,
> either steal the page or memcpy it.

Ha, this sounds very much like the cleancache project presented for
inclusion so many times. It has even visited and left linux-next a few
times. They solved all these problems with a few VFS hooks.

> It has the obvious coherence problems, which would be pretty simple
> to solve by hooking into the block write path as well.

See cleancache; they solved it with a simple VFS hook.

> The list of needed blocks can be very simply generated with the
> existing blktrace infrastructure. It does add permanent runtime
> overhead - once the cache is invalidated and disabled, every IO
> operation would incur a test-n-not-taken-branch. Maybe not too bad.
>
> Need to handle small-memory systems somehow, where the cache simply
> ooms the machine or becomes ineffective because it's causing eviction
> elsewhere.
>
> It could perhaps all be implemented as an md or dm driver.
>
> Or even as an IO scheduler. I say this because IO schedulers can be
> replaced on-the-fly, so the caching layer can be unloaded from the
> stack once it is finished with.

Or a cleancache driver.

Boaz
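For reference, the backend interface proposed by the cleancache patches
around this time looked roughly like the following (names approximate;
see the cleancache submissions for the authoritative definition). A
backend registers these hooks; the VFS calls put_page when it evicts a
clean page and get_page before going to disk, which is essentially the
lookaside behaviour Andrew describes:

/* Sketch of the proposed cleancache backend ops; kernel context assumed. */
struct cleancache_ops {
	int (*init_fs)(size_t pagesize);
	int (*init_shared_fs)(char *uuid, size_t pagesize);
	int (*get_page)(int pool_id, struct cleancache_filekey key,
			pgoff_t index, struct page *page);
	void (*put_page)(int pool_id, struct cleancache_filekey key,
			 pgoff_t index, struct page *page);
	void (*flush_page)(int pool_id, struct cleancache_filekey key,
			   pgoff_t index);
	void (*flush_inode)(int pool_id, struct cleancache_filekey key);
	void (*flush_fs)(int pool_id);
};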