2323#include "alloc-util.h"
2424#include "cgroup-util.h"
2525#include "escape.h"
26+ #include "fd-util.h"
27+ #include "fileio.h"
2628#include "fs-util.h"
2729#include "label.h"
2830#include "mkdir.h"
@@ -181,13 +183,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
181183
182184static int tmpfs_patch_options (
183185 const char * options ,
184- bool userns , uid_t uid_shift , uid_t uid_range ,
186+ bool userns ,
187+ uid_t uid_shift , uid_t uid_range ,
188+ bool patch_ids ,
185189 const char * selinux_apifs_context ,
186190 char * * ret ) {
187191
188192 char * buf = NULL ;
189193
190- if (userns && uid_shift != 0 ) {
194+ if (( userns && uid_shift != 0 ) || patch_ids ) {
191195 assert (uid_shift != UID_INVALID );
192196
193197 if (options )
@@ -218,7 +222,13 @@ static int tmpfs_patch_options(
218222 }
219223#endif
220224
225+ if (!buf && options ) {
226+ buf = strdup (options );
227+ if (!buf )
228+ return - ENOMEM ;
229+ }
221230 * ret = buf ;
231+
222232 return !!buf ;
223233}
224234
@@ -271,7 +281,15 @@ int mount_sysfs(const char *dest) {
271281 return log_error_errno (errno , "Failed to remove %s: %m" , full );
272282
273283 x = prefix_roota (top , "/fs/kdbus" );
274- (void ) mkdir (x , 0755 );
284+ (void ) mkdir_p (x , 0755 );
285+
286+ /* Create the mountpoint for cgroups now. Once /sys has been remounted
287+ * read-only below, we are no longer allowed to create it.
288+ */
289+ if (cg_ns_supported ()) {
290+ x = prefix_roota (top , "/fs/cgroup" );
291+ (void ) mkdir_p (x , 0755 );
292+ }
275293
276294 if (mount (NULL , top , NULL , MS_BIND |MS_RDONLY |MS_NOSUID |MS_NOEXEC |MS_NODEV |MS_REMOUNT , NULL ) < 0 )
277295 return log_error_errno (errno , "Failed to make %s read-only: %m" , top );
@@ -349,7 +367,7 @@ int mount_all(const char *dest,
349367
350368 o = mount_table [k ].options ;
351369 if (streq_ptr (mount_table [k ].type , "tmpfs" )) {
352- r = tmpfs_patch_options (o , use_userns , uid_shift , uid_range , selinux_apifs_context , & options );
370+ r = tmpfs_patch_options (o , use_userns , uid_shift , uid_range , false, selinux_apifs_context , & options );
353371 if (r < 0 )
354372 return log_oom ();
355373 if (r > 0 )
@@ -486,7 +504,7 @@ static int mount_tmpfs(
486504 if (r < 0 && r != - EEXIST )
487505 return log_error_errno (r , "Creating mount point for tmpfs %s failed: %m" , where );
488506
489- r = tmpfs_patch_options (m -> options , userns , uid_shift , uid_range , selinux_apifs_context , & buf );
507+ r = tmpfs_patch_options (m -> options , userns , uid_shift , uid_range , false, selinux_apifs_context , & buf );
490508 if (r < 0 )
491509 return log_oom ();
492510 options = r > 0 ? buf : m -> options ;
@@ -601,6 +619,48 @@ int mount_custom(
601619 return 0 ;
602620}
603621
622+ /* Retrieve existing subsystems. This function is called in a new cgroup
623+ * namespace.
624+ */
625+ static int get_controllers (Set * subsystems ) {
626+ _cleanup_fclose_ FILE * f = NULL ;
627+ char line [LINE_MAX ];
628+
629+ assert (subsystems );
630+
631+ f = fopen ("/proc/self/cgroup" , "re" );
632+ if (!f )
633+ return errno == ENOENT ? - ESRCH : - errno ;
634+
635+ FOREACH_LINE (line , f , return - errno ) {
636+ int r ;
637+ char * e , * l , * p ;
638+
639+ truncate_nl (line );
640+
641+ l = strchr (line , ':' );
642+ if (!l )
643+ continue ;
644+
645+ l ++ ;
646+ e = strchr (l , ':' );
647+ if (!e )
648+ continue ;
649+
650+ * e = 0 ;
651+
652+ if (streq (l , "" ) || streq (l , "name=systemd" ))
653+ continue ;
654+
655+ p = strdup (l );
656+ r = set_consume (subsystems , p );
657+ if (r < 0 )
658+ return r ;
659+ }
660+
661+ return 0 ;
662+ }
663+
604664static int mount_legacy_cgroup_hierarchy (const char * dest , const char * controller , const char * hierarchy , bool read_only ) {
605665 char * to ;
606666 int r ;
@@ -629,11 +689,107 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
629689 return 1 ;
630690}
631691
632- static int mount_legacy_cgroups (
633- const char * dest ,
692+ /* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
693+ static int mount_legacy_cgns_supported (
634694 bool userns , uid_t uid_shift , uid_t uid_range ,
635695 const char * selinux_apifs_context ) {
696+ _cleanup_set_free_free_ Set * controllers = NULL ;
697+ const char * cgroup_root = "/sys/fs/cgroup" , * c ;
698+ int r ;
636699
700+ (void ) mkdir_p (cgroup_root , 0755 );
701+
702+ /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
703+ r = path_is_mount_point (cgroup_root , AT_SYMLINK_FOLLOW );
704+ if (r < 0 )
705+ return log_error_errno (r , "Failed to determine if /sys/fs/cgroup is already mounted: %m" );
706+ if (r == 0 ) {
707+ _cleanup_free_ char * options = NULL ;
708+
709+ /* When cgroup namespaces are enabled and user namespaces are
710+ * used then the mount of the cgroupfs is done *inside* the new
711+ * user namespace. We're root in the new user namespace and the
712+ * kernel will happily translate our uid/gid to the correct
713+ * uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
714+ * pass uid 0 and not uid_shift to tmpfs_patch_options().
715+ */
716+ r = tmpfs_patch_options ("mode=755" , userns , 0 , uid_range , true, selinux_apifs_context , & options );
717+ if (r < 0 )
718+ return log_oom ();
719+
720+ if (mount ("tmpfs" , cgroup_root , "tmpfs" , MS_NOSUID |MS_NOEXEC |MS_NODEV |MS_STRICTATIME , options ) < 0 )
721+ return log_error_errno (errno , "Failed to mount /sys/fs/cgroup: %m" );
722+ }
723+
724+ if (cg_unified () > 0 )
725+ goto skip_controllers ;
726+
727+ controllers = set_new (& string_hash_ops );
728+ if (!controllers )
729+ return log_oom ();
730+
731+ r = get_controllers (controllers );
732+ if (r < 0 )
733+ return log_error_errno (r , "Failed to determine cgroup controllers: %m" );
734+
735+ for (;;) {
736+ _cleanup_free_ const char * controller = NULL ;
737+
738+ controller = set_steal_first (controllers );
739+ if (!controller )
740+ break ;
741+
742+ r = mount_legacy_cgroup_hierarchy ("" , controller , controller , !userns );
743+ if (r < 0 )
744+ return r ;
745+
746+ /* When multiple hierarchies are co-mounted, make their
747+ * constituting individual hierarchies a symlink to the
748+ * co-mount.
749+ */
750+ c = controller ;
751+ for (;;) {
752+ _cleanup_free_ char * target = NULL , * tok = NULL ;
753+
754+ r = extract_first_word (& c , & tok , "," , 0 );
755+ if (r < 0 )
756+ return log_error_errno (r , "Failed to extract co-mounted cgroup controller: %m" );
757+ if (r == 0 )
758+ break ;
759+
760+ target = prefix_root ("/sys/fs/cgroup" , tok );
761+ if (!target )
762+ return log_oom ();
763+
764+ if (streq (controller , tok ))
765+ break ;
766+
767+ r = symlink_idempotent (controller , target );
768+ if (r == - EINVAL )
769+ return log_error_errno (r , "Invalid existing symlink for combined hierarchy: %m" );
770+ if (r < 0 )
771+ return log_error_errno (r , "Failed to create symlink for combined hierarchy: %m" );
772+ }
773+ }
774+
775+ skip_controllers :
776+ r = mount_legacy_cgroup_hierarchy ("" , "none,name=systemd,xattr" , "systemd" , false);
777+ if (r < 0 )
778+ return r ;
779+
780+ if (!userns ) {
781+ if (mount (NULL , cgroup_root , NULL , MS_REMOUNT |MS_NOSUID |MS_NOEXEC |MS_NODEV |MS_STRICTATIME |MS_RDONLY , "mode=755" ) < 0 )
782+ return log_error_errno (errno , "Failed to remount %s read-only: %m" , cgroup_root );
783+ }
784+
785+ return 0 ;
786+ }
787+
788+ /* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
789+ static int mount_legacy_cgns_unsupported (
790+ const char * dest ,
791+ bool userns , uid_t uid_shift , uid_t uid_range ,
792+ const char * selinux_apifs_context ) {
637793 _cleanup_set_free_free_ Set * controllers = NULL ;
638794 const char * cgroup_root ;
639795 int r ;
@@ -649,7 +805,7 @@ static int mount_legacy_cgroups(
649805 if (r == 0 ) {
650806 _cleanup_free_ char * options = NULL ;
651807
652- r = tmpfs_patch_options ("mode=755" , userns , uid_shift , uid_range , selinux_apifs_context , & options );
808+ r = tmpfs_patch_options ("mode=755" , userns , uid_shift , uid_range , false, selinux_apifs_context , & options );
653809 if (r < 0 )
654810 return log_oom ();
655811
@@ -708,10 +864,8 @@ static int mount_legacy_cgroups(
708864 return r ;
709865
710866 r = symlink_idempotent (combined , target );
711- if (r == - EINVAL ) {
712- log_error ("Invalid existing symlink for combined hierarchy" );
713- return r ;
714- }
867+ if (r == - EINVAL )
868+ return log_error_errno (r , "Invalid existing symlink for combined hierarchy: %m" );
715869 if (r < 0 )
716870 return log_error_errno (r , "Failed to create symlink for combined hierarchy: %m" );
717871 }
@@ -766,8 +920,10 @@ int mount_cgroups(
766920
767921 if (unified_requested )
768922 return mount_unified_cgroups (dest );
769- else
770- return mount_legacy_cgroups (dest , userns , uid_shift , uid_range , selinux_apifs_context );
923+ else if (cg_ns_supported ())
924+ return mount_legacy_cgns_supported (userns , uid_shift , uid_range , selinux_apifs_context );
925+
926+ return mount_legacy_cgns_unsupported (dest , userns , uid_shift , uid_range , selinux_apifs_context );
771927}
772928
773929int mount_systemd_cgroup_writable (
@@ -835,7 +991,7 @@ int setup_volatile_state(
835991 return log_error_errno (errno , "Failed to create %s: %m" , directory );
836992
837993 options = "mode=755" ;
838- r = tmpfs_patch_options (options , userns , uid_shift , uid_range , selinux_apifs_context , & buf );
994+ r = tmpfs_patch_options (options , userns , uid_shift , uid_range , false, selinux_apifs_context , & buf );
839995 if (r < 0 )
840996 return log_oom ();
841997 if (r > 0 )
@@ -871,7 +1027,7 @@ int setup_volatile(
8711027 return log_error_errno (errno , "Failed to create temporary directory: %m" );
8721028
8731029 options = "mode=755" ;
874- r = tmpfs_patch_options (options , userns , uid_shift , uid_range , selinux_apifs_context , & buf );
1030+ r = tmpfs_patch_options (options , userns , uid_shift , uid_range , false, selinux_apifs_context , & buf );
8751031 if (r < 0 )
8761032 return log_oom ();
8771033 if (r > 0 )
0 commit comments