Skip to content

Commit a07b200

Browse files
author
Al Viro
committed
vfs: syscall: Add open_tree(2) to reference or clone a mount
open_tree(dfd, pathname, flags) Returns an O_PATH-opened file descriptor or an error. dfd and pathname specify the location to open, in usual fashion (see e.g. fstatat(2)). flags should be an OR of some of the following: * AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW - same meanings as usual * OPEN_TREE_CLOEXEC - make the resulting descriptor close-on-exec * OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE - instead of opening the location in question, create a detached mount tree matching the subtree rooted at the location specified by dfd/pathname. With AT_RECURSIVE the entire subtree is cloned, without it - only the part within the mount containing the location in question. In other words, the same as mount --rbind or mount --bind would've taken. The detached tree will be dissolved on the final close of the obtained file. Creation of such detached trees requires the same capabilities as doing mount --bind. Signed-off-by: Al Viro <[email protected]> Signed-off-by: David Howells <[email protected]> cc: [email protected] Signed-off-by: Al Viro <[email protected]>
1 parent 9e98c67 commit a07b200

File tree

9 files changed

+159
-28
lines changed

9 files changed

+159
-28
lines changed

arch/x86/entry/syscalls/syscall_32.tbl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,8 @@
398398
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
399399
385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
400400
386 i386 rseq sys_rseq __ia32_sys_rseq
401-
# don't use numbers 387 through 392, add new calls at the end
401+
387 i386 open_tree sys_open_tree __ia32_sys_open_tree
402+
# don't use numbers 388 through 392, add new calls at the end
402403
393 i386 semget sys_semget __ia32_sys_semget
403404
394 i386 semctl sys_semctl __ia32_compat_sys_semctl
404405
395 i386 shmget sys_shmget __ia32_sys_shmget

arch/x86/entry/syscalls/syscall_64.tbl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@
343343
332 common statx __x64_sys_statx
344344
333 common io_pgetevents __x64_sys_io_pgetevents
345345
334 common rseq __x64_sys_rseq
346+
335 common open_tree __x64_sys_open_tree
346347
# don't use numbers 387 through 423, add new calls after the last
347348
# 'common' entry
348349
424 common pidfd_send_signal __x64_sys_pidfd_send_signal

fs/file_table.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ static void __fput(struct file *file)
255255
struct dentry *dentry = file->f_path.dentry;
256256
struct vfsmount *mnt = file->f_path.mnt;
257257
struct inode *inode = file->f_inode;
258+
fmode_t mode = file->f_mode;
258259

259260
if (unlikely(!(file->f_mode & FMODE_OPENED)))
260261
goto out;
@@ -277,18 +278,20 @@ static void __fput(struct file *file)
277278
if (file->f_op->release)
278279
file->f_op->release(inode, file);
279280
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
280-
!(file->f_mode & FMODE_PATH))) {
281+
!(mode & FMODE_PATH))) {
281282
cdev_put(inode->i_cdev);
282283
}
283284
fops_put(file->f_op);
284285
put_pid(file->f_owner.pid);
285-
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
286+
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
286287
i_readcount_dec(inode);
287-
if (file->f_mode & FMODE_WRITER) {
288+
if (mode & FMODE_WRITER) {
288289
put_write_access(inode);
289290
__mnt_drop_write(mnt);
290291
}
291292
dput(dentry);
293+
if (unlikely(mode & FMODE_NEED_UNMOUNT))
294+
dissolve_on_fput(mnt);
292295
mntput(mnt);
293296
out:
294297
file_free(file);

fs/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *);
9494
extern void __mnt_drop_write(struct vfsmount *);
9595
extern void __mnt_drop_write_file(struct file *);
9696

97+
extern void dissolve_on_fput(struct vfsmount *);
9798
/*
9899
* fs_struct.c
99100
*/

fs/namespace.c

Lines changed: 135 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <linux/init.h> /* init_rootfs */
2121
#include <linux/fs_struct.h> /* get_fs_root et.al. */
2222
#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23+
#include <linux/file.h>
2324
#include <linux/uaccess.h>
2425
#include <linux/proc_ns.h>
2526
#include <linux/magic.h>
@@ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path)
18321833
return &tree->mnt;
18331834
}
18341835

1836+
static void free_mnt_ns(struct mnt_namespace *);
1837+
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
1838+
1839+
void dissolve_on_fput(struct vfsmount *mnt)
1840+
{
1841+
struct mnt_namespace *ns;
1842+
namespace_lock();
1843+
lock_mount_hash();
1844+
ns = real_mount(mnt)->mnt_ns;
1845+
umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
1846+
unlock_mount_hash();
1847+
namespace_unlock();
1848+
free_mnt_ns(ns);
1849+
}
1850+
18351851
void drop_collected_mounts(struct vfsmount *mnt)
18361852
{
18371853
namespace_lock();
@@ -2222,14 +2238,38 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
22222238
return false;
22232239
}
22242240

2241+
static struct mount *__do_loopback(struct path *old_path, int recurse)
2242+
{
2243+
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2244+
2245+
if (IS_MNT_UNBINDABLE(old))
2246+
return mnt;
2247+
2248+
if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2249+
return mnt;
2250+
2251+
if (!recurse && has_locked_children(old, old_path->dentry))
2252+
return mnt;
2253+
2254+
if (recurse)
2255+
mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2256+
else
2257+
mnt = clone_mnt(old, old_path->dentry, 0);
2258+
2259+
if (!IS_ERR(mnt))
2260+
mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2261+
2262+
return mnt;
2263+
}
2264+
22252265
/*
22262266
* do loopback mount.
22272267
*/
22282268
static int do_loopback(struct path *path, const char *old_name,
22292269
int recurse)
22302270
{
22312271
struct path old_path;
2232-
struct mount *mnt = NULL, *old, *parent;
2272+
struct mount *mnt = NULL, *parent;
22332273
struct mountpoint *mp;
22342274
int err;
22352275
if (!old_name || !*old_name)
@@ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name,
22432283
goto out;
22442284

22452285
mp = lock_mount(path);
2246-
err = PTR_ERR(mp);
2247-
if (IS_ERR(mp))
2286+
if (IS_ERR(mp)) {
2287+
err = PTR_ERR(mp);
22482288
goto out;
2289+
}
22492290

2250-
old = real_mount(old_path.mnt);
22512291
parent = real_mount(path->mnt);
2252-
2253-
err = -EINVAL;
2254-
if (IS_MNT_UNBINDABLE(old))
2255-
goto out2;
2256-
22572292
if (!check_mnt(parent))
22582293
goto out2;
22592294

2260-
if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
2261-
goto out2;
2262-
2263-
if (!recurse && has_locked_children(old, old_path.dentry))
2264-
goto out2;
2265-
2266-
if (recurse)
2267-
mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
2268-
else
2269-
mnt = clone_mnt(old, old_path.dentry, 0);
2270-
2295+
mnt = __do_loopback(&old_path, recurse);
22712296
if (IS_ERR(mnt)) {
22722297
err = PTR_ERR(mnt);
22732298
goto out2;
22742299
}
22752300

2276-
mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2277-
22782301
err = graft_tree(mnt, parent, mp);
22792302
if (err) {
22802303
lock_mount_hash();
@@ -2288,6 +2311,96 @@ static int do_loopback(struct path *path, const char *old_name,
22882311
return err;
22892312
}
22902313

2314+
static struct file *open_detached_copy(struct path *path, bool recursive)
2315+
{
2316+
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2317+
struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
2318+
struct mount *mnt, *p;
2319+
struct file *file;
2320+
2321+
if (IS_ERR(ns))
2322+
return ERR_CAST(ns);
2323+
2324+
namespace_lock();
2325+
mnt = __do_loopback(path, recursive);
2326+
if (IS_ERR(mnt)) {
2327+
namespace_unlock();
2328+
free_mnt_ns(ns);
2329+
return ERR_CAST(mnt);
2330+
}
2331+
2332+
lock_mount_hash();
2333+
for (p = mnt; p; p = next_mnt(p, mnt)) {
2334+
p->mnt_ns = ns;
2335+
ns->mounts++;
2336+
}
2337+
ns->root = mnt;
2338+
list_add_tail(&ns->list, &mnt->mnt_list);
2339+
mntget(&mnt->mnt);
2340+
unlock_mount_hash();
2341+
namespace_unlock();
2342+
2343+
mntput(path->mnt);
2344+
path->mnt = &mnt->mnt;
2345+
file = dentry_open(path, O_PATH, current_cred());
2346+
if (IS_ERR(file))
2347+
dissolve_on_fput(path->mnt);
2348+
else
2349+
file->f_mode |= FMODE_NEED_UNMOUNT;
2350+
return file;
2351+
}
2352+
2353+
SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags)
2354+
{
2355+
struct file *file;
2356+
struct path path;
2357+
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2358+
bool detached = flags & OPEN_TREE_CLONE;
2359+
int error;
2360+
int fd;
2361+
2362+
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2363+
2364+
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2365+
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2366+
OPEN_TREE_CLOEXEC))
2367+
return -EINVAL;
2368+
2369+
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2370+
return -EINVAL;
2371+
2372+
if (flags & AT_NO_AUTOMOUNT)
2373+
lookup_flags &= ~LOOKUP_AUTOMOUNT;
2374+
if (flags & AT_SYMLINK_NOFOLLOW)
2375+
lookup_flags &= ~LOOKUP_FOLLOW;
2376+
if (flags & AT_EMPTY_PATH)
2377+
lookup_flags |= LOOKUP_EMPTY;
2378+
2379+
if (detached && !may_mount())
2380+
return -EPERM;
2381+
2382+
fd = get_unused_fd_flags(flags & O_CLOEXEC);
2383+
if (fd < 0)
2384+
return fd;
2385+
2386+
error = user_path_at(dfd, filename, lookup_flags, &path);
2387+
if (unlikely(error)) {
2388+
file = ERR_PTR(error);
2389+
} else {
2390+
if (detached)
2391+
file = open_detached_copy(&path, flags & AT_RECURSIVE);
2392+
else
2393+
file = dentry_open(&path, O_PATH, current_cred());
2394+
path_put(&path);
2395+
}
2396+
if (IS_ERR(file)) {
2397+
put_unused_fd(fd);
2398+
return PTR_ERR(file);
2399+
}
2400+
fd_install(fd, file);
2401+
return fd;
2402+
}
2403+
22912404
/*
22922405
* Don't allow locked mount flags to be cleared.
22932406
*

include/linux/fs.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
162162
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
163163

164164
/* File is capable of returning -EAGAIN if I/O will block */
165-
#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
165+
#define FMODE_NOWAIT ((__force fmode_t)0x8000000)
166+
167+
/* File represents mount that needs unmounting */
168+
#define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000)
166169

167170
/* File does not contribute to nr_files count */
168-
#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
171+
#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000)
169172

170173
/*
171174
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector

include/linux/syscalls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -985,6 +985,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
985985
unsigned mask, struct statx __user *buffer);
986986
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
987987
int flags, uint32_t sig);
988+
asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
988989
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
989990
siginfo_t __user *info,
990991
unsigned int flags);

include/uapi/linux/fcntl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,5 +91,7 @@
9191
#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
9292
#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
9393

94+
#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
95+
9496

9597
#endif /* _UAPI_LINUX_FCNTL_H */

include/uapi/linux/mount.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,10 @@
5555
#define MS_MGC_VAL 0xC0ED0000
5656
#define MS_MGC_MSK 0xffff0000
5757

58+
/*
59+
* open_tree() flags.
60+
*/
61+
#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
62+
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
63+
5864
#endif /* _UAPI_LINUX_MOUNT_H */

0 commit comments

Comments
 (0)