Hi, The below set of patches implement open by handle support using exportfs operations. This allows user space application to map a file name to file handle and later open the file using handle. This should be usable for userspace NFS [1] and 9P server [2]. XFS already support this with the ioctls XFS_IOC_PATH_TO_HANDLE and XFS_IOC_OPEN_BY_HANDLE. [1] http://nfs-ganesha.sourceforge.net/ [2] http://thread.gmane.org/gmane.comp.emulators.qemu/68992 git repo for the patchset at: git://git.kernel.org/pub/scm/linux/kernel/git/kvaneesh/linux-open-handle.git open-by-handle Changes from V17: a) Add syscall number to asm-generic/unistd.h c) Fix wrong put_filep usage in do_handle_open Changes from V16: a) mnt_id is now an arg to syscall b) finish_open_handle is removed. Update finish_open to work with struct path and struct file. c) Add O_NOACCESS and use that for open_by_handle on symlink Changes from V15: a) Add mount id to file handle b) Add support for optional uuid tag in /proc/<pid>/mountinfo c) Added MAY_OPEN_LINK to support open on symlink instead of adding a new may_open function. Also limited the open flag to O_RDONLY. Changes from V14: a) Use fget_light instead of fget in the patches b) Drop uuid from struct file_handle as per the last review Changes from V13: a) Add support for file descriptor to handle conversion. This is needed so that we find the right file handle for newly created files. Changes from V12: a) Use CAP_DAC_READ_SEARCH instead of CAP_DAC_OVERRIDE in open_by_handle b) Return -ENOTDIR if O_DIRECTORY flag is specified in open_by_handle with handle for non directory Changes from V11: a) Add necessary documentation to different functions b) Add null pathname support to faccessat and linkat similar to readlinkat. c) compile fix on x86_64 Changes from V10: a) Missed an stg refresh before sending out the patchset. Send updated patchset. Changes from V9: a) Fix compile errors with CONFIG_EXPORTFS not ...
The exportfs encode handle function should return the minimum required
handle size. This helps user to find out the handle size by passing 0
handle size in the first step and then redoing to the call again with
the returned handle size value.
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/btrfs/export.c | 8 ++++++--
fs/exportfs/expfs.c | 9 +++++++--
fs/fat/inode.c | 4 +++-
fs/fuse/inode.c | 4 +++-
fs/gfs2/export.c | 8 ++++++--
fs/isofs/export.c | 8 ++++++--
fs/ocfs2/export.c | 8 +++++++-
fs/reiserfs/inode.c | 7 ++++++-
fs/udf/namei.c | 7 ++++++-
fs/xfs/linux-2.6/xfs_export.c | 4 +++-
include/linux/exportfs.h | 6 ++++--
mm/shmem.c | 4 +++-
12 files changed, 60 insertions(+), 17 deletions(-)
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09..5f8ee5a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
int len = *max_len;
int type;
- if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
- (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+ if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
return 255;
+ } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+ *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ return 255;
+ }
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e1759..cfee0f0 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -319,9 +319,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
struct inode * inode = dentry->d_inode;
int len = *max_len;
int type = FILEID_INO32_GEN;
-
- if (len < 2 || (connectable && len ...Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/exportfs/expfs.c | 2 +
fs/namei.c | 220 +++++++++++++++++++++++++++++++++++++++++++---
fs/open.c | 32 ++++++-
include/linux/fs.h | 8 ++-
include/linux/namei.h | 1 +
include/linux/syscalls.h | 3 +
6 files changed, 247 insertions(+), 19 deletions(-)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cfee0f0..05a1179 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -373,6 +373,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
/*
* Try to get any dentry for the given file handle from the filesystem.
*/
+ if (!nop || !nop->fh_to_dentry)
+ return ERR_PTR(-ESTALE);
result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
if (!result)
result = ERR_PTR(-ESTALE);
diff --git a/fs/namei.c b/fs/namei.c
index 17ea76b..1291dfc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
+#include <linux/exportfs.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -1042,6 +1043,29 @@ out_fail:
return retval;
}
+struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+ int fput_needed;
+ struct path *path;
+ struct file *filep;
+
+ if (fd == AT_FDCWD) {
+ struct fs_struct *fs = current->fs;
+ read_lock(&fs->lock);
+ path = &fs->pwd;
+ mntget(path->mnt);
+ read_unlock(&fs->lock);
+ } else {
+ filep = fget_light(fd, &fput_needed);
+ if (!filep)
+ return ERR_PTR(-EBADF);
+ path = &filep->f_path;
+ mntget(path->mnt);
+ fput_light(filep, fput_needed);
+ }
+ return path->mnt;
+}
+
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
@@ -1546,26 +1570,30 @@ static int open_will_truncate(int flag, struct inode *inode)
return (flag & ...The patch update may_open to allow handle based open on symlinks.
The file handle based API use file descritor returned from open_by_handle_at
to do different file system operations. To find the link target name we
need to get a file descriptor on symlinks.
We should be able to read the link target using file handle. The exact
usecase is with respect to implementing READLINK operation on a
userspace NFS server. The request contain the file handle and the
response include target name.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/namei.c | 24 ++++++++++++++++++++++--
include/asm-generic/fcntl.h | 1 +
include/linux/fs.h | 1 +
3 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 1291dfc..2c79363 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1447,7 +1447,20 @@ int may_open(struct path *path, int acc_mode, int flag)
switch (inode->i_mode & S_IFMT) {
case S_IFLNK:
- return -ELOOP;
+ /*
+ * Allow only if acc_mode contain
+ * open link request.
+ */
+ if (!(acc_mode & MAY_OPEN_LINK))
+ return -ELOOP;
+ /*
+ * Allow open on symlink only with
+ * open flag O_NOACCESS. We don't
+ * allow read/write on symlinks
+ */
+ if (flag != O_NOACCESS)
+ return -ELOOP;
+ break;
case S_IFDIR:
if (acc_mode & MAY_WRITE)
return -EISDIR;
@@ -2038,8 +2051,15 @@ long do_handle_open(int mountdirfd,
*/
if (open_flag & __O_SYNC)
open_flag |= O_DSYNC;
+ /*
+ * Handle based API allow open on a symlink
+ */
+ if (S_ISLNK(path->dentry->d_inode->i_mode))
+ acc_mode = MAY_OPEN_LINK;
+ else
+ acc_mode = MAY_OPEN;
- acc_mode = MAY_OPEN | ACC_MODE(open_flag);
+ acc_mode |= ACC_MODE(open_flag);
/* O_TRUNC implies we need access checks for write permissions */
if (open_flag & O_TRUNC)
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index a70b2d2..9d8f455 100644
--- a/include/asm-generic/fcntl.h
+++ ...If we don't do this, we will need new syscalls for doing stat, readlink and link on symlinks that take file handle as the argument. If that is ok, I can redo the patch series accordingly. -aneesh --
This ended up with the below new syscalls sys_handle_readlink, sys_handle_stat64, sys_handle_link, sys_handle_chown, sys_handle_setxattr, sys_handle_getxattr, sys_handle_listxattr, sys_handle_removexattr, sys_handle_utimes, compat_sys_handle_utimes -aneesh --
Suddenly getting an file pointer for a symlink which could never happen before is a really bad idea. Just add a proper readlink_by_handle system call, similar to what's done in the XFS interface. --
On Fri, 20 Aug 2010 04:30:57 -0400 Why is that? With futexes we suddenly get a file descriptor for something we could never get a file descriptor on before and that doesn't seem to be a problem. Why should symlinks be special as the only thing that you cannot have a file descriptor for? Uniformity of interface is a very valuable property. NeilBrown --
You are welcome to review the codepaths around pathname resolution for assumptions of presense of ->follow_link() and friends; there _are_ subtle cases and dumping your "opened symlinks" in there is far from a trivial change. Note that it affects more than just the starting points of lookups; /proc/*/fd/* stuff is also involved. BTW, speaking of NULL pathname, linkat() variant that allows creating a link to an opened file is also a very dubious thing; at the very least, you get non-trivial security implications, since now a process that got an opened descriptor passed to it by somebody else may create hardlinks to the sucker. --
[[email address for Nick Piggin changed to npiggin@kernel.dk]] On Fri, 20 Aug 2010 12:51:35 +0100 So as I understand it you aren't rejecting the concept in principle, but you believe non-trivial code review is required before it can be considered an acceptable change? Fair comment, and while one could imagine ways around this (such as requiring some Capability to link an fd) they wouldn't be very elegant. But then nor is inventing a pile of new syscalls for doing different things with handles such as the list Aneesh posted. Maybe a different approach is needed. How about a new AT flag: AT_FILE_HANDLE Meaning is that the 'dirfd' is used only to identify a filesystem (vfsmnt) and the 'name' pointer actually points to a filehandle fragment interpreted in that filesystem. One problem is that there is no way to pass the length... Options: fragment is at most 64 bytes nul padded at the end fragment is hex encoded and nul terminated ?? I think I prefer the hex encoding, but I'm hoping someone else has a better idea. NeilBrown --
Thanks, I had both of the same concerns as Christoph with API change and exposing symlink fds last time I looked at the patces, actually. But they can probably be worked around or avoided. I think the more important thing is whether it is worth supporting. This is all restricted to root (or CAP_DAC_READ_SEARCH) only, right, and what exact semantics they want. I would like to see more discussion of what this enables and some results. For the case of avoiding expensive network revalidations in path name lookup, do we even need to open symlinks? Could the security issues be avoided by always having handle attached to an open fd? --
For implementing a userspace file server that use handle for representing files (like NFS) we would require to have the ability to do different file system operations that can operate on symlink to work on handle too. -aneesh --
linkat change to take NULL path name is needed for regular files also. The file server will get a request to create a link with handle of oldpath and directory handle of new path and new name. The changes would also help in the implementation of 9P server in Qemu for implementing a fast virtualization passthrough file system (aka VirtFS) http://thread.gmane.org/gmane.comp.emulators.qemu/68992 -aneesh --
Right. Is this a really important goal, I'm wondering? Is it realistic (ie. to be able to remove the nfs server from the kernel)? --
The feature is also needed to implement a 9p virtio pass through file system in Qemu http://thread.gmane.org/gmane.comp.emulators.qemu/68992 -aneesh --
I think so. The hardest part is if you want to export a filesystem that also has non-nfs users--in the kernel it's much easier to maintain consistency between local users of the filesystem and nfs clients. If the server has no local users, then userspace may turn out to be easier. In that case you might ask why the server needs a real filesystem to export at all--in theory it could maintain all its data on its own. But I suspect if you try that you find you're reinventing a lot of wheels. --b. --
On Sat, 21 Aug 2010 18:30:24 +1000 They allow a credible user-space implementation of the server for some I don't see what you are getting at here... which particular security isses, and what fd would you use? As I understand it there are only two security issues that have been noted. 1/ lookup-by-filehandle can bypass any 'search' permission tests on ancestor directories. I cannot see any way to avoid this except require CAP_DAC_READ_SEARCH 2/ Creating a hardlink to an 'fd' allows a process that was given an 'fd' that it could not have opened itself to prevent that file from being removed (and space reclaimed) by creating a private hardlink. This could be avoided by requiring CAP_DAC_READ_SEARCH for that particular operation (and probably requiring i_nlink > 0 anyway) but that feels like a very special-case restriction. Was it one of these that you were referring to? Thanks, --
Why would that be an issue ? Even though a hardlink can be created, only process with right privileges can access them. One problem is; being able to create hardlinks at different directory location as above implies that there is no way to guarantee that the file got completely removed from the system. Is this the security risk that you are pointing -aneesh --
struct file_handle already include length -aneesh --
On Sat, 21 Aug 2010 01:13:52 -0600
We already parse filenames into components separated by '/'. Is HEX decoding
that much more ugly.
Filehandles are currently passed between the kernel and mountd as HEX
That part of a filehandle that nfsd gives to the filesystem is one byte out
of a 4-byte header, plus the tail of the filehandle after the part that
identifies the filesystem.
This 'one byte' does imply the length, but it doesn't necessarily encode it.
Rather it is a 'type'. So it cannot really be used to determine the length
at the point when the filehandle would need to be copied from userspace into
the kernel.
I don't think there is any precedent for passing a 4-byte length followed by
a binary string, while there is plenty of precedent for passing a
nul-terminated ASCII string.
[[ Following this approach I would like to avoid any filehandle-specific
syscalls altogether.
Just use a *at syscall with AT_FILE_HANDLE for filehandle lookup, and use
getxattr('system:linux.file_handle') to get the filehandle for a given path.
Ofcourse we would need to at *at versions of the *xattr syscalls, but that is
probably a good idea anyway.
]]
--
There are at* syscalls that doesn't take the additional flags as the argument, like openat, readlinkat. How will handle based open and readlink work with the above interface ? -aneesh --
On Mon, 23 Aug 2010 06:54:03 +0530 Bother... you are right. I had remembered that at the time that all that *at calls were added there was discussion about how you always need some flags, particularly in the context of adding O_CLOEXEC and (I thought) a flag to allow non-sequential allocation of fds. I had thought that they all got 'flags' arguments as a result, but it seems not. For openat you could squeeze something into the current 'flags' arg (O_FILE_HANDLE), but for readlinkat, symlinkat at least there is no such option. Sad really. NeilBrown --
IMHO that is really bad overloading of flags value. I also find hex encoded handle in pathname argument of *at syscalls confusing. The above discussion also hint that we would need a new syscall for open, readlink and setxattr, So how about me posting the new series which remove open on symlink patch and add a bunch of syscalls to allow operation on symlinks based on handle ? -aneesh --
sys_setxattrat would take 7 arguments. -aneesh --
On Mon, 23 Aug 2010 08:19:48 +0530 I count 6 - assuming we share the 'flags' argument. There are only 2 xattr flags currently so there should be room to share. dirfd path name value size flags == 6 NeilBrown --
Why ot creating a special file system for this kind of operation ? I mean a vfsmnt filesystem, with each directory on the root is a symlink to the root of the real vfsmnt root ? I could be even be in proc space like /proc/self/vfsmnt path_to_handle will return a relative path from this directory like 0x75843558/somehandle (if X is on /usr/bin/X and usr is mounted by filesystem 0x75843558) path_to_fshandle() will return 0x75843558 opening file handle will be just a matter to thus open /proc/self/vfsmount/0x75843558/somehandle Permission will be determined by vfsmount filesystem. No need to create new syscall all te handle to filename will be handle by the vfsmount filesystem We could even use at existing command. The dirfd will need to be only /proc/self/vfsmnt (and if you need to get a fd without mounting /proc create a syscall to get this fd). Does sound plausible ? Bastien --
On Tue, 24 Aug 2010 11:41:10 +0200 I don't think so. I'm not 100% certain what you are proposing, but I think the basic idea is a virtual filesystem where giving a textual filehandle as a name gives access to the file with that filehandle. This could only work by creating a virtual symlink from the name to the object in whichever filesystem - somewhat like /proc/self/fd/*. This could be used to open the file, not to create a hard-link or read a symlink which are two of the issues we are struggling with. Maybe I have misunderstood you though. NeilBrown --
This issue could be raised by the open O_NODE patch (http://lwn.net/Articles/364735/). For symlink it will allow to read the symlink. For hard link some bsd have fsdb that allow this kind of stuff. in our case the security implication are huge, We could undelete a removed file. In fact you need a flink(fd, "new path"). I could be useful in some context note but seriously restricted. Bastien --
FWIW
commit 82af7aca56c67061420d618cc5a30f0fd4106b80
Author: Eric Sesterhenn <snakebyte@gmx.de>
Date: Fri Jan 25 10:40:46 2008 +0100
Removal of FUTEX_FD
--
On Fri, 20 Aug 2010 15:25:29 +0200
Interesting - thanks.
OK, adjust my argument to reference signalfd, timerfs and pollfd.
I haven't looked at that code before - "anon_inode_getfd" gives you an fd on
an inode for which
(stat.st_mode & S_IFMT) == 0
Cool! They have no format at all :-)
NeilBrown
--
It is not just readlink that we would need. We would require sys_handle_readlink, sys_handle_stat64, sys_handle_link, sys_handle_chown, sys_handle_setxattr, sys_handle_getxattr, sys_handle_listxattr, sys_handle_removexattr, sys_handle_utimes, compat_sys_handle_utimes I have most of the changes done, so if we are ok adding all these new syscalls i can post the patch series. -aneesh --
From: NeilBrown <neilb@suse.de>
This enables to use readlink to get the link target name
from a file descriptor point to the link. This can be used
with open_by_handle syscall that returns a file descriptor for a link.
We can then use this file descriptor to get the target name.
This is similar to utimensat(2) interface
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/stat.c | 30 ++++++++++++++++++++++--------
1 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/fs/stat.c b/fs/stat.c
index 12e90e2..3723a9f 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -289,26 +289,40 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
char __user *, buf, int, bufsiz)
{
- struct path path;
- int error;
+ int error = 0, fput_needed;
+ struct path path, *pp;
+ struct file *file = NULL;
if (bufsiz <= 0)
return -EINVAL;
- error = user_path_at(dfd, pathname, 0, &path);
+ if (pathname == NULL && dfd != AT_FDCWD) {
+ file = fget_light(dfd, &fput_needed);
+
+ if (file)
+ pp = &file->f_path;
+ else
+ error = -EBADF;
+ } else {
+ error = user_path_at(dfd, pathname, 0, &path);
+ pp = &path;
+ }
if (!error) {
- struct inode *inode = path.dentry->d_inode;
+ struct inode *inode = pp->dentry->d_inode;
error = -EINVAL;
if (inode->i_op->readlink) {
- error = security_inode_readlink(path.dentry);
+ error = security_inode_readlink(pp->dentry);
if (!error) {
- touch_atime(path.mnt, path.dentry);
- error = inode->i_op->readlink(path.dentry,
+ touch_atime(pp->mnt, pp->dentry);
+ error = inode->i_op->readlink(pp->dentry,
buf, bufsiz);
}
}
- path_put(&path);
+ if (file)
+ fput_light(file, fput_needed);
+ else
+ path_put(&path);
}
return error;
}
--
1.7.0.4
--
Changing the interfaces of existing system calls to accept a NULL name which previously wasn't acceptable is not valid. Even for new system calls I think it's a bad idea. utimensat already has the most ugly code of all fs related system calls because of that. --
On Fri, 20 Aug 2010 04:32:13 -0400 Again, you have made an assertion without justifying it. I disagree, I think it is perfectly valid. The function would still return an error in every case that it previously returned an error, because in every previous case the fd would not have referred to a symlink so the new version will have nothing to perform a readlink on when NULL is passed. The error would probably be EINVAL rather than EFAULT but even that difference could be avoided if it was really important (I doubt it is). You only get different behaviour if the path name is NULL and the fd refers to a symlink. This is a completely new situation for which there is no precedent for how the syscall should behave. We are free to create whatever behaviour is most consistent. I believe the proposed behaviour is the correct one. The code may be ugly, but I'm sure it can be tidied up, particularly if several system-calls adopted the same calling convention. NeilBrown --
But from an interface point, it seems nice to say if the relative name from an fd is NULL, the operation should happen on fd itself. -aneesh --
This enables to use linkat to create hardlinks from a
file descriptor pointing to the file. This can be used
with open_by_handle syscall that returns a file descriptor.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/namei.c | 36 ++++++++++++++++++++++++++----------
1 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 2c79363..fd9febc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2627,7 +2627,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
struct inode *inode = old_dentry->d_inode;
int error;
- if (!inode)
+ if (!inode || inode->i_nlink == 0)
return -ENOENT;
error = may_create(dir, new_dentry);
@@ -2673,16 +2673,28 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
{
struct dentry *new_dentry;
struct nameidata nd;
- struct path old_path;
- int error;
+ struct path old_path, *old_pathp;
+ struct file *file = NULL;
+ int error, fput_needed;
char *to;
if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
return -EINVAL;
- error = user_path_at(olddfd, oldname,
- flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
- &old_path);
+ if (oldname == NULL && olddfd != AT_FDCWD) {
+ file = fget_light(olddfd, &fput_needed);
+ if (file) {
+ old_pathp = &file->f_path;
+ error = 0;
+ } else
+ error = -EBADF;
+ } else {
+ error = user_path_at(olddfd, oldname,
+ flags & AT_SYMLINK_FOLLOW ?
+ LOOKUP_FOLLOW : 0,
+ &old_path);
+ old_pathp = &old_path;
+ }
if (error)
return error;
@@ -2690,7 +2702,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if (error)
goto out;
error = -EXDEV;
- if (old_path.mnt != nd.path.mnt)
+ if (old_pathp->mnt != nd.path.mnt)
goto out_release;
new_dentry = lookup_create(&nd, 0);
error = PTR_ERR(new_dentry);
@@ -2699,10 +2711,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
error = ...This patch adds sys_name_to_handle_at and sys_open_by_handle_at syscalls to x86_32 Acked-by: Serge Hallyn <serue@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- arch/x86/include/asm/unistd_32.h | 5 ++++- arch/x86/kernel/syscall_table_32.S | 2 ++ 2 files changed, 6 insertions(+), 1 deletions(-) diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e..62e35b7 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -346,10 +346,13 @@ #define __NR_fanotify_init 338 #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 + #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 343 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786d..c314b21 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,5 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at -- 1.7.0.4 --
Add sys_name_to_handle_at and sys_open_by_handle_at syscalls
for x86_64
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/x86/ia32/ia32entry.S | 2 ++
arch/x86/include/asm/unistd_64.h | 5 +++++
fs/compat.c | 11 +++++++++++
3 files changed, 18 insertions(+), 0 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b86feab..d93cd0d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -845,4 +845,6 @@ ia32_sys_call_table:
.quad sys_fanotify_init
.quad sys32_fanotify_mark
.quad sys_prlimit64 /* 340 */
+ .quad sys_name_to_handle_at
+ .quad compat_sys_open_by_handle_at
ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 363e9b8..24d8a38 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,6 +669,11 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
#define __NR_prlimit64 302
__SYSCALL(__NR_prlimit64, sys_prlimit64)
+#define __NR_name_to_handle_at 303
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at 304
+__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
+
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/fs/compat.c b/fs/compat.c
index 718c706..c4037fd 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2334,3 +2334,14 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
}
#endif /* CONFIG_TIMERFD */
+
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+ struct file_handle __user *handle, int flags)
+{
+ return do_handle_open(mountdirfd, handle, flags);
+}
--
1.7.0.4
--
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- include/asm-generic/unistd.h | 6 +++++- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index b969770..8e7bd87 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -646,9 +646,13 @@ __SYSCALL(__NR_prlimit64, sys_prlimit64) __SYSCALL(__NR_fanotify_init, sys_fanotify_init) #define __NR_fanotify_mark 263 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) +#define __NR_name_to_handle_at 264 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 265 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #undef __NR_syscalls -#define __NR_syscalls 264 +#define __NR_syscalls 266 /* * All syscalls below here should go away really, -- 1.7.0.4 --
We add a per superblock uuid field. File systems should
update the uuid in the fill_super callback
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/namespace.c | 16 ++++++++++++++++
include/linux/fs.h | 1 +
2 files changed, 17 insertions(+), 0 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb1..041645a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -837,6 +837,18 @@ const struct seq_operations mounts_op = {
.show = show_vfsmnt
};
+static int uuid_is_nil(u8 *uuid)
+{
+ int i;
+ u8 *cp = (u8 *)uuid;
+
+ for (i = 0; i < 16; i++) {
+ if (*cp++)
+ return 0;
+ }
+ return 1;
+}
+
static int show_mountinfo(struct seq_file *m, void *v)
{
struct proc_mounts *p = m->private;
@@ -875,6 +887,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
if (IS_MNT_UNBINDABLE(mnt))
seq_puts(m, " unbindable");
+ if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+ /* print the uuid */
+ seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
+
/* Filesystem specific data */
seq_puts(m, " - ");
show_type(m, sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3eb3571..06654e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1370,6 +1370,7 @@ struct super_block {
wait_queue_head_t s_wait_unfrozen;
char s_id[32]; /* Informational name */
+ u8 s_uuid[16]; /* UUID */
void *s_fs_info; /* Filesystem private info */
fmode_t s_mode;
--
1.7.0.4
--
File system UUID is made available to application via /proc/<pid>/mountinfo Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- fs/ext3/super.c | 1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 5dbf4db..6dda322 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1918,6 +1918,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); -- 1.7.0.4 --
File system UUID is made available to application via /proc/<pid>/mountinfo Acked-by: Serge Hallyn <serue@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> --- fs/ext4/super.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2614774..b46a78c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2941,6 +2941,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); -- 1.7.0.4 --
This enables to use faccessat to get the access check details
from a file descriptor pointing to the file. This can be used
with open_by_handle syscall that returns a file descriptor.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/open.c | 29 +++++++++++++++++++++--------
1 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 7abdcba..f907b50 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -288,9 +288,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
const struct cred *old_cred;
struct cred *override_cred;
- struct path path;
+ struct file *file = NULL;
+ struct path path, *pp;
struct inode *inode;
- int res;
+ int res, fput_needed;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
return -EINVAL;
@@ -312,12 +313,21 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
}
old_cred = override_creds(override_cred);
-
- res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ if (filename == NULL && dfd != AT_FDCWD) {
+ file = fget_light(dfd, &fput_needed);
+ if (file) {
+ pp = &file->f_path;
+ res = 0;
+ } else
+ res = -EBADF;
+ } else {
+ res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ pp = &path;
+ }
if (res)
goto out;
- inode = path.dentry->d_inode;
+ inode = pp->dentry->d_inode;
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
@@ -325,7 +335,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
* with the "noexec" flag.
*/
res = -EACCES;
- if (path.mnt->mnt_flags & MNT_NOEXEC)
+ if (pp->mnt->mnt_flags & MNT_NOEXEC)
goto out_path_release;
}
@@ -343,11 +353,14 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
* inherently racy and know that the fs may change
* state before we even see this result.
*/
- if (__mnt_is_readonly(path.mnt))
+ if ...The syscall also return mount id which can be used
to lookup file system specific information such as uuid
in /proc/<pid>/mountinfo
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
fs/open.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/exportfs.h | 3 +
include/linux/fs.h | 8 +++
include/linux/syscalls.h | 5 ++-
4 files changed, 144 insertions(+), 1 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 630715f..efb1806 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
#include <linux/fs_struct.h>
#include <linux/ima.h>
#include <linux/dnotify.h>
+#include <linux/exportfs.h>
#include "internal.h"
@@ -1042,3 +1043,131 @@ int nonseekable_open(struct inode *inode, struct file *filp)
}
EXPORT_SYMBOL(nonseekable_open);
+
+#ifdef CONFIG_EXPORTFS
+static long do_sys_name_to_handle(struct path *path,
+ struct file_handle __user *ufh,
+ int __user *mnt_id)
+{
+ long retval;
+ int handle_size;
+ struct file_handle f_handle;
+ struct file_handle *handle = NULL;
+
+ if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+ retval = -EFAULT;
+ goto err_out;
+ }
+ if (f_handle.handle_size > MAX_HANDLE_SZ) {
+ retval = -EINVAL;
+ goto err_out;
+ }
+ handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_size,
+ GFP_KERNEL);
+ if (!handle) {
+ retval = -ENOMEM;
+ goto err_out;
+ }
+
+ /* convert handle size to multiple of sizeof(u32) */
+ handle_size = f_handle.handle_size >> 2;
+
+ /* we ask for a non connected handle */
+ retval = exportfs_encode_fh(path->dentry,
+ (struct fid *)handle->f_handle,
+ &handle_size, 0);
+ /* convert handle size to bytes */
+ handle_size *= sizeof(u32);
+ handle->handle_type = retval;
+ handle->handle_size = handle_size;
+ if (handle_size > f_handle.handle_size) {
+ /*
+ * set the handle_size to zero so we copy only
+ * non variable part of the file_handle
+ ...| Greg KH | Og dreams of kernels |
| Jens Axboe | [PATCH 31/33] Fusion: sg chaining support |
| Arnd Bergmann | Re: finding your own dead "CONFIG_" variables |
| Mark Brown | [PATCH 2/2] Subject: natsemi: Allow users to disable workaround for DspCfg reset |
| Tony Breeds | [LGUEST] Look in object dir for .config |
git: | |
| Brian Downing | Re: Git in a Nutshell guide |
| John Benes | Re: master has some toys |
| Matthias Lederhofer |
