[PATCH -V18 09/13] x86: Add new syscalls for x86_64

Previous thread: linux-next: build warning after merge of the input tree by Stephen Rothwell on Thursday, August 19, 2010 - 6:33 pm. (1 message)

Next thread: [PATCH 1/4] regulator/wm831x-dcdc: fix potential NULL dereference by Axel Lin on Thursday, August 19, 2010 - 6:58 pm. (6 messages)
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

Hi,

The below set of patches implement open by handle support using exportfs
operations. This allows user space application to map a file name to file 
handle and later open the file using handle. This should be usable
for userspace NFS [1] and 9P server [2]. XFS already support this with the ioctls
XFS_IOC_PATH_TO_HANDLE and XFS_IOC_OPEN_BY_HANDLE.

[1] http://nfs-ganesha.sourceforge.net/
[2] http://thread.gmane.org/gmane.comp.emulators.qemu/68992

git repo for the patchset at:

git://git.kernel.org/pub/scm/linux/kernel/git/kvaneesh/linux-open-handle.git open-by-handle

Changes from V17:
a) Add syscall number to asm-generic/unistd.h
c) Fix wrong put_filep usage in do_handle_open

Changes from V16:
a) mnt_id is now an arg to syscall
b) finish_open_handle is removed. Update finish_open to work with
   struct path and struct file.
c) Add O_NOACCESS and use that for open_by_handle on symlink

Changes from V15:
a) Add mount id to file handle
b) Add support for optional uuid tag in /proc/<pid>/mountinfo
c) Added MAY_OPEN_LINK to support open on symlink instead of
   adding a new may_open function. Also limited the open flag
   to O_RDONLY.

Changes from V14:
a) Use fget_light instead of fget in the patches
b) Drop uuid from struct file_handle as per the last review

Changes from V13:
a) Add support for file descriptor to handle conversion. This is needed
   so that we find the right file handle for newly created files.
   
Changes from V12:
a) Use CAP_DAC_READ_SEARCH instead of CAP_DAC_OVERRIDE in open_by_handle
b) Return -ENOTDIR if O_DIRECTORY flag is specified in open_by_handle with
   handle for non directory

Changes from V11:
a) Add necessary documentation to different functions
b) Add null pathname support to faccessat and linkat similar to
   readlinkat.
c) compile fix on x86_64

Changes from V10:
a) Missed an stg refresh before sending out the patchset. Send
   updated patchset.

Changes from V9:
a) Fix compile errors with CONFIG_EXPORTFS not ...
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

The exportfs encode handle function should return the minimum required
handle size. This helps user to find out the handle size by passing 0
handle size in the first step and then redoing to the call again with
the returned handle size value.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/btrfs/export.c             |    8 ++++++--
 fs/exportfs/expfs.c           |    9 +++++++--
 fs/fat/inode.c                |    4 +++-
 fs/fuse/inode.c               |    4 +++-
 fs/gfs2/export.c              |    8 ++++++--
 fs/isofs/export.c             |    8 ++++++--
 fs/ocfs2/export.c             |    8 +++++++-
 fs/reiserfs/inode.c           |    7 ++++++-
 fs/udf/namei.c                |    7 ++++++-
 fs/xfs/linux-2.6/xfs_export.c |    4 +++-
 include/linux/exportfs.h      |    6 ++++--
 mm/shmem.c                    |    4 +++-
 12 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09..5f8ee5a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	int len = *max_len;
 	int type;
 
-	if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
-	    (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+	if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
 		return 255;
+	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+		return 255;
+	}
 
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
 	type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e1759..cfee0f0 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -319,9 +319,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
-	
-	if (len < 2 || (connectable && len ...
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/exportfs/expfs.c      |    2 +
 fs/namei.c               |  220 +++++++++++++++++++++++++++++++++++++++++++---
 fs/open.c                |   32 ++++++-
 include/linux/fs.h       |    8 ++-
 include/linux/namei.h    |    1 +
 include/linux/syscalls.h |    3 +
 6 files changed, 247 insertions(+), 19 deletions(-)

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cfee0f0..05a1179 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -373,6 +373,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76b..1291dfc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
 #include <linux/fcntl.h>
 #include <linux/device_cgroup.h>
 #include <linux/fs_struct.h>
+#include <linux/exportfs.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -1042,6 +1043,29 @@ out_fail:
 	return retval;
 }
 
+struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	int fput_needed;
+	struct path *path;
+	struct file *filep;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		read_lock(&fs->lock);
+		path = &fs->pwd;
+		mntget(path->mnt);
+		read_unlock(&fs->lock);
+	} else {
+		filep = fget_light(fd, &fput_needed);
+		if (!filep)
+			return ERR_PTR(-EBADF);
+		path = &filep->f_path;
+		mntget(path->mnt);
+		fput_light(filep, fput_needed);
+	}
+	return path->mnt;
+}
+
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int do_path_lookup(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
@@ -1546,26 +1570,30 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & ...
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

The patch update may_open to allow handle based open on symlinks.
The file handle based API use file descritor returned from open_by_handle_at
to do different file system operations. To find the link target name we
need to get a file descriptor on symlinks.

We should be able to read the link target using file handle. The exact
usecase is with respect to implementing READLINK operation on a
userspace NFS server. The request contain the file handle and the
response include target name.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/namei.c                  |   24 ++++++++++++++++++++++--
 include/asm-generic/fcntl.h |    1 +
 include/linux/fs.h          |    1 +
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1291dfc..2c79363 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1447,7 +1447,20 @@ int may_open(struct path *path, int acc_mode, int flag)
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFLNK:
-		return -ELOOP;
+		/*
+		 * Allow only if acc_mode contain
+		 * open link request.
+		 */
+		if (!(acc_mode & MAY_OPEN_LINK))
+			return -ELOOP;
+		/*
+		 * Allow open on symlink only with
+		 * open flag O_NOACCESS. We don't
+		 * allow read/write on symlinks
+		 */
+		if (flag != O_NOACCESS)
+			return -ELOOP;
+		break;
 	case S_IFDIR:
 		if (acc_mode & MAY_WRITE)
 			return -EISDIR;
@@ -2038,8 +2051,15 @@ long do_handle_open(int mountdirfd,
 	 */
 	if (open_flag & __O_SYNC)
 		open_flag |= O_DSYNC;
+	/*
+	 * Handle based API allow open on a symlink
+	 */
+	if (S_ISLNK(path->dentry->d_inode->i_mode))
+		acc_mode = MAY_OPEN_LINK;
+	else
+		acc_mode = MAY_OPEN;
 
-	acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+	acc_mode |= ACC_MODE(open_flag);
 
 	/* O_TRUNC implies we need access checks for write permissions */
 	if (open_flag & O_TRUNC)
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index a70b2d2..9d8f455 100644
--- a/include/asm-generic/fcntl.h
+++ ...
From: Aneesh Kumar K. V
Date: Thursday, August 19, 2010 - 7:13 pm

If we don't do this, we will need new syscalls for doing stat, readlink
and link on symlinks that take file handle as the argument. If that is ok,
I can redo the patch series accordingly.

-aneesh
--

From: Aneesh Kumar K. V
Date: Thursday, August 19, 2010 - 11:53 pm

This ended up with the below new syscalls

sys_handle_readlink, sys_handle_stat64, sys_handle_link, sys_handle_chown,
sys_handle_setxattr, sys_handle_getxattr, sys_handle_listxattr,
sys_handle_removexattr, sys_handle_utimes, compat_sys_handle_utimes

-aneesh
--

From: Christoph Hellwig
Date: Friday, August 20, 2010 - 1:30 am

Suddenly getting an file pointer for a symlink which could never happen
before is a really bad idea.  Just add a proper readlink_by_handle
system call, similar to what's done in the XFS interface.

--

From: Neil Brown
Date: Friday, August 20, 2010 - 2:53 am

On Fri, 20 Aug 2010 04:30:57 -0400

Why is that?
With futexes we suddenly get a file descriptor for something we could never
get a file descriptor on before and that doesn't seem to be a problem.

Why should symlinks be special as the only thing that you cannot have a file
descriptor for?  Uniformity of interface is a very valuable property.

NeilBrown
--

From: Al Viro
Date: Friday, August 20, 2010 - 4:51 am

You are welcome to review the codepaths around pathname resolution for
assumptions of presense of ->follow_link() and friends; there _are_
subtle cases and dumping your "opened symlinks" in there is far from
a trivial change.  Note that it affects more than just the starting
points of lookups; /proc/*/fd/* stuff is also involved.

BTW, speaking of NULL pathname, linkat() variant that allows creating a link
to an opened file is also a very dubious thing; at the very least, you get
non-trivial security implications, since now a process that got an opened
descriptor passed to it by somebody else may create hardlinks to the sucker.
--

From: Neil Brown
Date: Friday, August 20, 2010 - 5:09 pm

[[email address for Nick Piggin changed to npiggin@kernel.dk]]

On Fri, 20 Aug 2010 12:51:35 +0100

So as I understand it you aren't rejecting the concept in principle, but you
believe non-trivial code review is required before it can be considered an
acceptable change?

Fair comment, and while one could imagine ways around this (such as requiring
some Capability to link an fd) they wouldn't be very elegant.
But then nor is inventing a pile of new syscalls for doing different things
with handles such as the list Aneesh posted.

Maybe a different approach is needed.

How about a new AT flag:  AT_FILE_HANDLE

Meaning is that the 'dirfd' is used only to identify a filesystem (vfsmnt) and
the 'name' pointer actually points to a filehandle fragment interpreted in
that filesystem.

One problem is that there is no way to pass the length...
Options:
   fragment is at most 64 bytes nul padded at the end
   fragment is hex encoded and nul terminated
   ??

I think I prefer the hex encoding, but I'm hoping someone else has a better
idea.

NeilBrown
--

From: Nick Piggin
Date: Saturday, August 21, 2010 - 1:30 am

Thanks, I had both of the same concerns as Christoph with API
change and exposing symlink fds last time I looked at the patces,
actually.

But they can probably be worked around or avoided. I think the more
important thing is whether it is worth supporting. This is
all restricted to root (or CAP_DAC_READ_SEARCH) only, right, and
what exact semantics they want. I would like to see more discussion
of what this enables and some results.

For the case of avoiding expensive network revalidations in path name
lookup, do we even need to open symlinks? Could the security issues be
avoided by always having handle attached to an open fd?

--

From: Aneesh Kumar K. V
Date: Saturday, August 21, 2010 - 2:42 am

For implementing a userspace file server that use handle for
representing files (like NFS) we would require to have the ability to do
different file system operations that can operate on symlink to work on
handle too. 

-aneesh

--

From: Aneesh Kumar K. V
Date: Saturday, August 21, 2010 - 7:02 pm

linkat change to take NULL path name is needed for regular files
also. The file server will get a request to create a link with handle
of oldpath and directory handle of new path and new name.

The changes would also help in the implementation of 9P server in Qemu
for implementing a fast virtualization passthrough file system (aka
VirtFS)

http://thread.gmane.org/gmane.comp.emulators.qemu/68992

-aneesh
--

From: Nick Piggin
Date: Tuesday, August 24, 2010 - 12:21 am

Right. Is this a really important goal, I'm wondering? Is it realistic
(ie. to be able to remove the nfs server from the kernel)?

--

From: Aneesh Kumar K. V
Date: Tuesday, August 24, 2010 - 3:34 am

The feature is also needed to implement a 9p virtio pass through file
system in Qemu http://thread.gmane.org/gmane.comp.emulators.qemu/68992

-aneesh

--

From: J. Bruce Fields
Date: Tuesday, August 24, 2010 - 6:19 am

I think so.

The hardest part is if you want to export a filesystem that also has
non-nfs users--in the kernel it's much easier to maintain consistency
between local users of the filesystem and nfs clients.

If the server has no local users, then userspace may turn out to be
easier.

In that case you might ask why the server needs a real filesystem to
export at all--in theory it could maintain all its data on its own.  But
I suspect if you try that you find you're reinventing a lot of wheels.

--b.
--

From: Neil Brown
Date: Sunday, August 22, 2010 - 4:17 pm

On Sat, 21 Aug 2010 18:30:24 +1000

They allow a credible user-space implementation of the server for some

I don't see what you are getting at here... which particular security isses,
and what fd would you use?

As I understand it there are only two security issues that have been noted.
1/ lookup-by-filehandle can bypass any 'search' permission tests on ancestor
   directories.  I cannot see any way to avoid this except require
   CAP_DAC_READ_SEARCH
2/ Creating a hardlink to an 'fd'  allows a process that was given an 'fd'
   that it could not have opened itself to prevent that file from being
   removed (and space reclaimed) by creating a private hardlink.
   This could be avoided by requiring CAP_DAC_READ_SEARCH for that particular
   operation (and probably requiring i_nlink > 0 anyway) but that feels like
   a very special-case restriction.

Was it one of these that you were referring to?

Thanks,

--

From: Aneesh Kumar K. V
Date: Saturday, August 21, 2010 - 2:31 am

Why would that be an issue ? Even though a hardlink can be created,
only process with right privileges can access them. One problem is;
being able to create hardlinks at different directory location as above
implies that there is no way to guarantee that the file got completely
removed from the system. Is this the security risk that you are pointing







-aneesh
--

From: Aneesh Kumar K. V
Date: Saturday, August 21, 2010 - 2:32 am

struct file_handle already include length

-aneesh

--

From: Neil Brown
Date: Sunday, August 22, 2010 - 4:06 pm

On Sat, 21 Aug 2010 01:13:52 -0600

We already parse filenames into components separated by '/'.  Is HEX decoding
that much more ugly.

Filehandles are currently passed between the kernel and mountd as HEX

That part of a filehandle that nfsd gives to the filesystem is one byte out
of a 4-byte header, plus the tail of the filehandle after the part that
identifies the filesystem.
This 'one byte' does imply the length, but it doesn't necessarily encode it.
Rather it is a 'type'.  So it cannot really be used to determine the length
at the point when the filehandle would need to be copied from userspace into
the kernel.


I don't think there is any precedent for passing a 4-byte length followed by
a binary string, while there is plenty of precedent for passing a
nul-terminated ASCII string.

[[ Following this approach I would like to avoid any filehandle-specific
syscalls altogether.
Just use a *at syscall with AT_FILE_HANDLE for filehandle lookup, and use
getxattr('system:linux.file_handle') to get the filehandle for a given path.

Ofcourse we would need to at *at versions of the *xattr syscalls, but that is
probably a good idea anyway.
]]


--

From: Aneesh Kumar K. V
Date: Sunday, August 22, 2010 - 6:24 pm

There are at* syscalls that doesn't take the additional flags as the
argument, like openat, readlinkat. How will handle based open and
readlink work with the above interface ?

-aneesh

--

From: Neil Brown
Date: Sunday, August 22, 2010 - 6:52 pm

On Mon, 23 Aug 2010 06:54:03 +0530

Bother... you are right.

I had remembered that at the time that all that *at calls were added there was
discussion about how you always need some flags, particularly in the context
of adding O_CLOEXEC and (I thought) a flag to allow non-sequential allocation
of fds.
I had thought that they all got 'flags' arguments as a result, but it seems
not.
For openat you could squeeze something into the current 'flags' arg
(O_FILE_HANDLE), but for readlinkat, symlinkat at least there is no such
option.  Sad really.

NeilBrown
--

From: Aneesh Kumar K. V
Date: Tuesday, August 24, 2010 - 3:40 am

IMHO that is really bad overloading of flags value. I also find  hex
encoded handle in pathname argument of *at syscalls confusing. The above
discussion also hint that we would need a new syscall for open,
readlink and setxattr, So how about me posting the new series which
remove open on symlink patch and add a bunch of syscalls to allow
operation on symlinks based on handle ?

-aneesh



--

From: Aneesh Kumar K. V
Date: Sunday, August 22, 2010 - 7:49 pm

sys_setxattrat would take 7 arguments.

-aneesh
--

From: Neil Brown
Date: Tuesday, August 24, 2010 - 7:06 pm

On Mon, 23 Aug 2010 08:19:48 +0530

I count 6 - assuming we share the 'flags' argument.  There are only 2 xattr
flags currently so there should be room to share.

  dirfd  path name value size flags == 6

NeilBrown
--

From: Bastien ROUCARIES
Date: Tuesday, August 24, 2010 - 2:41 am

Why ot creating a special file system for this kind of operation ?
I mean a vfsmnt filesystem, with each directory on the root is a
symlink to the root of the real vfsmnt root ?

I could be even be in proc space like /proc/self/vfsmnt

path_to_handle will return a relative path from this directory like
0x75843558/somehandle (if X is on /usr/bin/X and usr is mounted by
filesystem 0x75843558)
path_to_fshandle() will return 0x75843558

opening file handle will be just a matter to thus open
/proc/self/vfsmount/0x75843558/somehandle

Permission will be determined by vfsmount filesystem.

No need to create new syscall all te handle to filename will be handle
by the vfsmount filesystem

We could even use at existing command. The dirfd will need to be only
/proc/self/vfsmnt (and if you need to get a fd without mounting /proc
create a syscall to get this fd).

Does sound plausible ?

Bastien
--

From: Neil Brown
Date: Tuesday, August 24, 2010 - 7:04 pm

On Tue, 24 Aug 2010 11:41:10 +0200

I don't think so.

I'm not 100% certain what you are proposing, but I think the basic idea is a
virtual filesystem where giving a textual filehandle as a name gives access
to the file with that filehandle.

This could only work by creating a virtual symlink from the name to the
object in whichever filesystem - somewhat like /proc/self/fd/*.
This could be used to open the file, not to create a hard-link or read a
symlink which are two of the issues we are struggling with.

Maybe I have misunderstood you though.

NeilBrown
--

From: Bastien ROUCARIES
Date: Wednesday, August 25, 2010 - 2:13 am

This issue could be raised by the open O_NODE patch
(http://lwn.net/Articles/364735/).
For symlink it will allow to read the symlink.

For hard link some bsd have fsdb that allow this kind of stuff. in our
case the security implication are huge, We could undelete a removed
file. In fact you need a flink(fd, "new path"). I could be useful in
some context note but seriously restricted.

Bastien
--

From: Peter Zijlstra
Date: Friday, August 20, 2010 - 6:25 am

FWIW

commit 82af7aca56c67061420d618cc5a30f0fd4106b80
Author: Eric Sesterhenn <snakebyte@gmx.de>
Date:   Fri Jan 25 10:40:46 2008 +0100

    Removal of FUTEX_FD

--

From: Neil Brown
Date: Friday, August 20, 2010 - 4:47 pm

On Fri, 20 Aug 2010 15:25:29 +0200

Interesting - thanks.

OK, adjust my argument to reference signalfd, timerfs and pollfd.

I haven't looked at that code before - "anon_inode_getfd" gives you an fd on
an inode for which 
    (stat.st_mode & S_IFMT) == 0
Cool!  They have no format at all :-)

NeilBrown
--

From: Aneesh Kumar K. V
Date: Friday, August 20, 2010 - 7:38 am

It is not just readlink that we would need. We would require

   sys_handle_readlink, sys_handle_stat64, sys_handle_link, sys_handle_chown,
   sys_handle_setxattr, sys_handle_getxattr, sys_handle_listxattr,
   sys_handle_removexattr, sys_handle_utimes, compat_sys_handle_utimes

I have most of the changes done, so if we are ok adding all these
new syscalls i can post the patch series.

-aneesh

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

From: NeilBrown <neilb@suse.de>

This enables to use readlink to get the link target name
from a file descriptor point to the link. This can be used
with open_by_handle syscall that returns a file descriptor for a link.
We can then use this file descriptor to get the target name.

This is similar to utimensat(2) interface

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/stat.c |   30 ++++++++++++++++++++++--------
 1 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/stat.c b/fs/stat.c
index 12e90e2..3723a9f 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -289,26 +289,40 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
 		char __user *, buf, int, bufsiz)
 {
-	struct path path;
-	int error;
+	int error = 0, fput_needed;
+	struct path path, *pp;
+	struct file *file = NULL;
 
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_at(dfd, pathname, 0, &path);
+	if (pathname == NULL && dfd != AT_FDCWD) {
+		file = fget_light(dfd, &fput_needed);
+
+		if (file)
+			pp = &file->f_path;
+		else
+			error = -EBADF;
+	} else {
+		error = user_path_at(dfd, pathname, 0, &path);
+		pp = &path;
+	}
 	if (!error) {
-		struct inode *inode = path.dentry->d_inode;
+		struct inode *inode = pp->dentry->d_inode;
 
 		error = -EINVAL;
 		if (inode->i_op->readlink) {
-			error = security_inode_readlink(path.dentry);
+			error = security_inode_readlink(pp->dentry);
 			if (!error) {
-				touch_atime(path.mnt, path.dentry);
-				error = inode->i_op->readlink(path.dentry,
+				touch_atime(pp->mnt, pp->dentry);
+				error = inode->i_op->readlink(pp->dentry,
 							      buf, bufsiz);
 			}
 		}
-		path_put(&path);
+		if (file)
+			fput_light(file, fput_needed);
+		else
+			path_put(&path);
 	}
 	return error;
 }
-- 
1.7.0.4

--

From: Christoph Hellwig
Date: Friday, August 20, 2010 - 1:32 am

Changing the interfaces of existing system calls to accept a NULL name
which previously wasn't acceptable is not valid.  Even for new system
calls I think it's a bad idea.  utimensat already has the most ugly
code of all fs related system calls because of that.

--

From: Neil Brown
Date: Friday, August 20, 2010 - 3:04 am

On Fri, 20 Aug 2010 04:32:13 -0400

Again, you have made an assertion without justifying it.
I disagree, I think it is perfectly valid.

The function would still return an error in every case that it previously
returned an error, because in every previous case the fd would not have
referred to a symlink so the new version will have nothing to perform a
readlink on when NULL is passed.  The error would probably be EINVAL rather
than EFAULT but even that difference could be avoided if it was really
important (I doubt it is).

You only get different behaviour if the path name is NULL and the fd refers
to a symlink.  This is a completely new situation for which there is no
precedent for how the syscall should behave.  We are free to create whatever
behaviour is most consistent.  I believe the proposed behaviour is the
correct one.

The code may be ugly, but I'm sure it can be tidied up, particularly if
several system-calls adopted the same calling convention.

NeilBrown
--

From: Aneesh Kumar K. V
Date: Friday, August 20, 2010 - 7:43 am

But from an interface point, it seems nice to say if the relative name
from an fd is NULL, the operation should happen on fd itself.

-aneesh
--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

This enables to use linkat to create hardlinks from a
file descriptor pointing to the file. This can be used
with open_by_handle syscall that returns a file descriptor.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/namei.c |   36 ++++++++++++++++++++++++++----------
 1 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2c79363..fd9febc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2627,7 +2627,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	struct inode *inode = old_dentry->d_inode;
 	int error;
 
-	if (!inode)
+	if (!inode || inode->i_nlink == 0)
 		return -ENOENT;
 
 	error = may_create(dir, new_dentry);
@@ -2673,16 +2673,28 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 {
 	struct dentry *new_dentry;
 	struct nameidata nd;
-	struct path old_path;
-	int error;
+	struct path old_path, *old_pathp;
+	struct file *file = NULL;
+	int error, fput_needed;
 	char *to;
 
 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
-	error = user_path_at(olddfd, oldname,
-			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			     &old_path);
+	if (oldname == NULL && olddfd != AT_FDCWD) {
+		file = fget_light(olddfd, &fput_needed);
+		if (file) {
+			old_pathp = &file->f_path;
+			error = 0;
+		} else
+			error = -EBADF;
+	} else {
+		error = user_path_at(olddfd, oldname,
+				     flags & AT_SYMLINK_FOLLOW ?
+				     LOOKUP_FOLLOW : 0,
+				     &old_path);
+		old_pathp = &old_path;
+	}
 	if (error)
 		return error;
 
@@ -2690,7 +2702,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	if (error)
 		goto out;
 	error = -EXDEV;
-	if (old_path.mnt != nd.path.mnt)
+	if (old_pathp->mnt != nd.path.mnt)
 		goto out_release;
 	new_dentry = lookup_create(&nd, 0);
 	error = PTR_ERR(new_dentry);
@@ -2699,10 +2711,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	error = ...
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

This patch adds sys_name_to_handle_at and sys_open_by_handle_at
syscalls to x86_32

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/x86/include/asm/unistd_32.h   |    5 ++++-
 arch/x86/kernel/syscall_table_32.S |    2 ++
 2 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b766a5e..62e35b7 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,10 +346,13 @@
 #define __NR_fanotify_init	338
 #define __NR_fanotify_mark	339
 #define __NR_prlimit64		340
+#define __NR_name_to_handle_at	341
+#define __NR_open_by_handle_at  342
+
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 341
+#define NR_syscalls 343
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index b35786d..c314b21 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,3 +340,5 @@ ENTRY(sys_call_table)
 	.long sys_fanotify_init
 	.long sys_fanotify_mark
 	.long sys_prlimit64		/* 340 */
+	.long sys_name_to_handle_at
+	.long sys_open_by_handle_at
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

Add sys_name_to_handle_at and sys_open_by_handle_at syscalls
for x86_64

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/x86/ia32/ia32entry.S        |    2 ++
 arch/x86/include/asm/unistd_64.h |    5 +++++
 fs/compat.c                      |   11 +++++++++++
 3 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b86feab..d93cd0d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -845,4 +845,6 @@ ia32_sys_call_table:
 	.quad sys_fanotify_init
 	.quad sys32_fanotify_mark
 	.quad sys_prlimit64		/* 340 */
+	.quad sys_name_to_handle_at
+	.quad compat_sys_open_by_handle_at
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8..24d8a38 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,11 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
 #define __NR_prlimit64				302
 __SYSCALL(__NR_prlimit64, sys_prlimit64)
+#define __NR_name_to_handle_at			303
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at			304
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
+
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/fs/compat.c b/fs/compat.c
index 718c706..c4037fd 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2334,3 +2334,14 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 
 #endif /* CONFIG_TIMERFD */
+
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 include/asm-generic/unistd.h |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h
index b969770..8e7bd87 100644
--- a/include/asm-generic/unistd.h
+++ b/include/asm-generic/unistd.h
@@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64)
 __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
 #define __NR_fanotify_mark 263
 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
+#define __NR_name_to_handle_at	264
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at	265
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
 
 #undef __NR_syscalls
-#define __NR_syscalls 264
+#define __NR_syscalls 266
 
 /*
  * All syscalls below here should go away really,
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

We add a per superblock uuid field. File systems should
update the uuid in the fill_super callback

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/namespace.c     |   16 ++++++++++++++++
 include/linux/fs.h |    1 +
 2 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb1..041645a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -837,6 +837,18 @@ const struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int uuid_is_nil(u8 *uuid)
+{
+	int i;
+	u8  *cp = (u8 *)uuid;
+
+	for (i = 0; i < 16; i++) {
+		if (*cp++)
+			return 0;
+	}
+	return 1;
+}
+
 static int show_mountinfo(struct seq_file *m, void *v)
 {
 	struct proc_mounts *p = m->private;
@@ -875,6 +887,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	if (IS_MNT_UNBINDABLE(mnt))
 		seq_puts(m, " unbindable");
 
+	if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+		/* print the uuid */
+		seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
+
 	/* Filesystem specific data */
 	seq_puts(m, " - ");
 	show_type(m, sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3eb3571..06654e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1370,6 +1370,7 @@ struct super_block {
 	wait_queue_head_t	s_wait_unfrozen;
 
 	char s_id[32];				/* Informational name */
+	u8 s_uuid[16];				/* UUID */
 
 	void 			*s_fs_info;	/* Filesystem private info */
 	fmode_t			s_mode;
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext3/super.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dbf4db..6dda322 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1918,6 +1918,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext3_qctl_operations;
 	sb->dq_op = &ext3_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/ext4/super.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2614774..b46a78c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2941,6 +2941,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
1.7.0.4

--

From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

This enables to use faccessat to get the access check details
from a file descriptor pointing to the file. This can be used
with open_by_handle syscall that returns a file descriptor.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/open.c |   29 +++++++++++++++++++++--------
 1 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 7abdcba..f907b50 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -288,9 +288,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 {
 	const struct cred *old_cred;
 	struct cred *override_cred;
-	struct path path;
+	struct file *file = NULL;
+	struct path path, *pp;
 	struct inode *inode;
-	int res;
+	int res, fput_needed;
 
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 		return -EINVAL;
@@ -312,12 +313,21 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 	}
 
 	old_cred = override_creds(override_cred);
-
-	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+	if (filename == NULL  && dfd != AT_FDCWD) {
+		file = fget_light(dfd, &fput_needed);
+		if (file) {
+			pp = &file->f_path;
+			res = 0;
+		} else
+			res = -EBADF;
+	} else {
+		res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+		pp = &path;
+	}
 	if (res)
 		goto out;
 
-	inode = path.dentry->d_inode;
+	inode = pp->dentry->d_inode;
 
 	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		/*
@@ -325,7 +335,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 		 * with the "noexec" flag.
 		 */
 		res = -EACCES;
-		if (path.mnt->mnt_flags & MNT_NOEXEC)
+		if (pp->mnt->mnt_flags & MNT_NOEXEC)
 			goto out_path_release;
 	}
 
@@ -343,11 +353,14 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 	 * inherently racy and know that the fs may change
 	 * state before we even see this result.
 	 */
-	if (__mnt_is_readonly(path.mnt))
+	if ...
From: Aneesh Kumar K.V
Date: Thursday, August 19, 2010 - 6:51 pm

The syscall also return mount id which can be used
to lookup file system specific information such as uuid
in /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/open.c                |  129 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/exportfs.h |    3 +
 include/linux/fs.h       |    8 +++
 include/linux/syscalls.h |    5 ++-
 4 files changed, 144 insertions(+), 1 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index 630715f..efb1806 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
 #include <linux/dnotify.h>
+#include <linux/exportfs.h>
 
 #include "internal.h"
 
@@ -1042,3 +1043,131 @@ int nonseekable_open(struct inode *inode, struct file *filp)
 }
 
 EXPORT_SYMBOL(nonseekable_open);
+
+#ifdef CONFIG_EXPORTFS
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	int handle_size;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto err_out;
+	}
+	if (f_handle.handle_size > MAX_HANDLE_SZ) {
+		retval = -EINVAL;
+		goto err_out;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto err_out;
+	}
+
+	/* convert handle size to  multiple of sizeof(u32) */
+	handle_size = f_handle.handle_size >> 2;
+
+	/* we ask for a non connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_size,  0);
+	/* convert handle size to bytes */
+	handle_size *= sizeof(u32);
+	handle->handle_type = retval;
+	handle->handle_size = handle_size;
+	if (handle_size > f_handle.handle_size) {
+		/*
+		 * set the handle_size to zero so we copy only
+		 * non variable part of the file_handle
+		 ...
Previous thread: linux-next: build warning after merge of the input tree by Stephen Rothwell on Thursday, August 19, 2010 - 6:33 pm. (1 message)

Next thread: [PATCH 1/4] regulator/wm831x-dcdc: fix potential NULL dereference by Axel Lin on Thursday, August 19, 2010 - 6:58 pm. (6 messages)