Skip to content

Commit 1a0b98c

Browse files
authored
Merge pull request systemd#3589 from brauner/cgroup_namespace
Cgroup namespace
2 parents c92fcc4 + 0996ef0 commit 1a0b98c

File tree

5 files changed

+221
-28
lines changed

5 files changed

+221
-28
lines changed

src/basic/cgroup-util.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,20 @@ int cg_read_event(const char *controller, const char *path, const char *event,
134134
return -ENOENT;
135135
}
136136

137+
/* Tells whether cgroup namespaces are available, judged by whether
 * /proc/self/ns/cgroup exists. The probe runs at most once per thread; the
 * result is kept in a thread-local cache (-1 = not probed yet, 0 = no,
 * 1 = yes) and returned directly on later calls. */
bool cg_ns_supported(void) {
        static thread_local int enabled = -1;

        if (enabled < 0)
                enabled = access("/proc/self/ns/cgroup", F_OK) == 0 ? 1 : 0;

        return enabled;
}
150+
137151
int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
138152
_cleanup_free_ char *fs = NULL;
139153
int r;

src/basic/cgroup-util.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ int cg_mask_supported(CGroupMask *ret);
222222

223223
int cg_kernel_controllers(Set *controllers);
224224

225+
bool cg_ns_supported(void);
226+
225227
int cg_unified(void);
226228
void cg_unified_flush(void);
227229

src/basic/missing.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,10 @@ struct btrfs_ioctl_quota_ctl_args {
445445
#define CGROUP2_SUPER_MAGIC 0x63677270
446446
#endif
447447

448+
/* Fallback definition for build environments whose headers do not provide
 * CLONE_NEWCGROUP, the flag nspawn passes to unshare() to enter a new cgroup
 * namespace. */
#ifndef CLONE_NEWCGROUP
449+
#define CLONE_NEWCGROUP 0x02000000
450+
#endif
451+
448452
#ifndef TMPFS_MAGIC
449453
#define TMPFS_MAGIC 0x01021994
450454
#endif

src/nspawn/nspawn-mount.c

Lines changed: 172 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include "alloc-util.h"
2424
#include "cgroup-util.h"
2525
#include "escape.h"
26+
#include "fd-util.h"
27+
#include "fileio.h"
2628
#include "fs-util.h"
2729
#include "label.h"
2830
#include "mkdir.h"
@@ -181,13 +183,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
181183

182184
static int tmpfs_patch_options(
183185
const char *options,
184-
bool userns, uid_t uid_shift, uid_t uid_range,
186+
bool userns,
187+
uid_t uid_shift, uid_t uid_range,
188+
bool patch_ids,
185189
const char *selinux_apifs_context,
186190
char **ret) {
187191

188192
char *buf = NULL;
189193

190-
if (userns && uid_shift != 0) {
194+
if ((userns && uid_shift != 0) || patch_ids) {
191195
assert(uid_shift != UID_INVALID);
192196

193197
if (options)
@@ -218,7 +222,13 @@ static int tmpfs_patch_options(
218222
}
219223
#endif
220224

225+
if (!buf && options) {
226+
buf = strdup(options);
227+
if (!buf)
228+
return -ENOMEM;
229+
}
221230
*ret = buf;
231+
222232
return !!buf;
223233
}
224234

@@ -271,7 +281,15 @@ int mount_sysfs(const char *dest) {
271281
return log_error_errno(errno, "Failed to remove %s: %m", full);
272282

273283
x = prefix_roota(top, "/fs/kdbus");
274-
(void) mkdir(x, 0755);
284+
(void) mkdir_p(x, 0755);
285+
286+
/* Create mountpoint for cgroups. Otherwise we are not allowed since we
287+
* remount /sys read-only.
288+
*/
289+
if (cg_ns_supported()) {
290+
x = prefix_roota(top, "/fs/cgroup");
291+
(void) mkdir_p(x, 0755);
292+
}
275293

276294
if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
277295
return log_error_errno(errno, "Failed to make %s read-only: %m", top);
@@ -349,7 +367,7 @@ int mount_all(const char *dest,
349367

350368
o = mount_table[k].options;
351369
if (streq_ptr(mount_table[k].type, "tmpfs")) {
352-
r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
370+
r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
353371
if (r < 0)
354372
return log_oom();
355373
if (r > 0)
@@ -486,7 +504,7 @@ static int mount_tmpfs(
486504
if (r < 0 && r != -EEXIST)
487505
return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
488506

489-
r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
507+
r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
490508
if (r < 0)
491509
return log_oom();
492510
options = r > 0 ? buf : m->options;
@@ -601,6 +619,48 @@ int mount_custom(
601619
return 0;
602620
}
603621

622+
/* Retrieve existing subsystems. This function is called in a new cgroup
623+
* namespace.
624+
*/
625+
static int get_controllers(Set *subsystems) {
626+
_cleanup_fclose_ FILE *f = NULL;
627+
char line[LINE_MAX];
628+
629+
assert(subsystems);
630+
631+
f = fopen("/proc/self/cgroup", "re");
632+
if (!f)
633+
return errno == ENOENT ? -ESRCH : -errno;
634+
635+
FOREACH_LINE(line, f, return -errno) {
636+
int r;
637+
char *e, *l, *p;
638+
639+
truncate_nl(line);
640+
641+
l = strchr(line, ':');
642+
if (!l)
643+
continue;
644+
645+
l++;
646+
e = strchr(l, ':');
647+
if (!e)
648+
continue;
649+
650+
*e = 0;
651+
652+
if (streq(l, "") || streq(l, "name=systemd"))
653+
continue;
654+
655+
p = strdup(l);
656+
r = set_consume(subsystems, p);
657+
if (r < 0)
658+
return r;
659+
}
660+
661+
return 0;
662+
}
663+
604664
static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
605665
char *to;
606666
int r;
@@ -629,11 +689,107 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
629689
return 1;
630690
}
631691

632-
static int mount_legacy_cgroups(
633-
const char *dest,
692+
/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
693+
static int mount_legacy_cgns_supported(
634694
bool userns, uid_t uid_shift, uid_t uid_range,
635695
const char *selinux_apifs_context) {
696+
_cleanup_set_free_free_ Set *controllers = NULL;
697+
const char *cgroup_root = "/sys/fs/cgroup", *c;
698+
int r;
636699

700+
(void) mkdir_p(cgroup_root, 0755);
701+
702+
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
703+
r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
704+
if (r < 0)
705+
return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
706+
if (r == 0) {
707+
_cleanup_free_ char *options = NULL;
708+
709+
/* When cgroup namespaces are enabled and user namespaces are
710+
* used then the mount of the cgroupfs is done *inside* the new
711+
* user namespace. We're root in the new user namespace and the
712+
* kernel will happily translate our uid/gid to the correct
713+
* uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
714+
* pass uid 0 and not uid_shift to tmpfs_patch_options().
715+
*/
716+
r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
717+
if (r < 0)
718+
return log_oom();
719+
720+
if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
721+
return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
722+
}
723+
724+
if (cg_unified() > 0)
725+
goto skip_controllers;
726+
727+
controllers = set_new(&string_hash_ops);
728+
if (!controllers)
729+
return log_oom();
730+
731+
r = get_controllers(controllers);
732+
if (r < 0)
733+
return log_error_errno(r, "Failed to determine cgroup controllers: %m");
734+
735+
for (;;) {
736+
_cleanup_free_ const char *controller = NULL;
737+
738+
controller = set_steal_first(controllers);
739+
if (!controller)
740+
break;
741+
742+
r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
743+
if (r < 0)
744+
return r;
745+
746+
/* When multiple hierarchies are co-mounted, make their
747+
* constituting individual hierarchies a symlink to the
748+
* co-mount.
749+
*/
750+
c = controller;
751+
for (;;) {
752+
_cleanup_free_ char *target = NULL, *tok = NULL;
753+
754+
r = extract_first_word(&c, &tok, ",", 0);
755+
if (r < 0)
756+
return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
757+
if (r == 0)
758+
break;
759+
760+
target = prefix_root("/sys/fs/cgroup", tok);
761+
if (!target)
762+
return log_oom();
763+
764+
if (streq(controller, tok))
765+
break;
766+
767+
r = symlink_idempotent(controller, target);
768+
if (r == -EINVAL)
769+
return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
770+
if (r < 0)
771+
return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
772+
}
773+
}
774+
775+
skip_controllers:
776+
r = mount_legacy_cgroup_hierarchy("", "none,name=systemd,xattr", "systemd", false);
777+
if (r < 0)
778+
return r;
779+
780+
if (!userns) {
781+
if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
782+
return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
783+
}
784+
785+
return 0;
786+
}
787+
788+
/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
789+
static int mount_legacy_cgns_unsupported(
790+
const char *dest,
791+
bool userns, uid_t uid_shift, uid_t uid_range,
792+
const char *selinux_apifs_context) {
637793
_cleanup_set_free_free_ Set *controllers = NULL;
638794
const char *cgroup_root;
639795
int r;
@@ -649,7 +805,7 @@ static int mount_legacy_cgroups(
649805
if (r == 0) {
650806
_cleanup_free_ char *options = NULL;
651807

652-
r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
808+
r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
653809
if (r < 0)
654810
return log_oom();
655811

@@ -708,10 +864,8 @@ static int mount_legacy_cgroups(
708864
return r;
709865

710866
r = symlink_idempotent(combined, target);
711-
if (r == -EINVAL) {
712-
log_error("Invalid existing symlink for combined hierarchy");
713-
return r;
714-
}
867+
if (r == -EINVAL)
868+
return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
715869
if (r < 0)
716870
return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
717871
}
@@ -766,8 +920,10 @@ int mount_cgroups(
766920

767921
if (unified_requested)
768922
return mount_unified_cgroups(dest);
769-
else
770-
return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
923+
else if (cg_ns_supported())
924+
return mount_legacy_cgns_supported(userns, uid_shift, uid_range, selinux_apifs_context);
925+
926+
return mount_legacy_cgns_unsupported(dest, userns, uid_shift, uid_range, selinux_apifs_context);
771927
}
772928

773929
int mount_systemd_cgroup_writable(
@@ -835,7 +991,7 @@ int setup_volatile_state(
835991
return log_error_errno(errno, "Failed to create %s: %m", directory);
836992

837993
options = "mode=755";
838-
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
994+
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
839995
if (r < 0)
840996
return log_oom();
841997
if (r > 0)
@@ -871,7 +1027,7 @@ int setup_volatile(
8711027
return log_error_errno(errno, "Failed to create temporary directory: %m");
8721028

8731029
options = "mode=755";
874-
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
1030+
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
8751031
if (r < 0)
8761032
return log_oom();
8771033
if (r > 0)

src/nspawn/nspawn.c

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2589,9 +2589,24 @@ static int inner_child(
25892589
return -ESRCH;
25902590
}
25912591

2592-
r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2593-
if (r < 0)
2594-
return r;
2592+
if (cg_ns_supported()) {
2593+
r = unshare(CLONE_NEWCGROUP);
2594+
if (r < 0)
2595+
return log_error_errno(errno, "Failed to unshare cgroup namespace");
2596+
r = mount_cgroups(
2597+
"",
2598+
arg_unified_cgroup_hierarchy,
2599+
arg_userns_mode != USER_NAMESPACE_NO,
2600+
arg_uid_shift,
2601+
arg_uid_range,
2602+
arg_selinux_apifs_context);
2603+
if (r < 0)
2604+
return r;
2605+
} else {
2606+
r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
2607+
if (r < 0)
2608+
return r;
2609+
}
25952610

25962611
r = reset_uid_gid();
25972612
if (r < 0)
@@ -2973,15 +2988,17 @@ static int outer_child(
29732988
if (r < 0)
29742989
return r;
29752990

2976-
r = mount_cgroups(
2977-
directory,
2978-
arg_unified_cgroup_hierarchy,
2979-
arg_userns_mode != USER_NAMESPACE_NO,
2980-
arg_uid_shift,
2981-
arg_uid_range,
2982-
arg_selinux_apifs_context);
2983-
if (r < 0)
2984-
return r;
2991+
if (!cg_ns_supported()) {
2992+
r = mount_cgroups(
2993+
directory,
2994+
arg_unified_cgroup_hierarchy,
2995+
arg_userns_mode != USER_NAMESPACE_NO,
2996+
arg_uid_shift,
2997+
arg_uid_range,
2998+
arg_selinux_apifs_context);
2999+
if (r < 0)
3000+
return r;
3001+
}
29853002

29863003
r = mount_move_root(directory);
29873004
if (r < 0)

0 commit comments

Comments
 (0)