@@ -97,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = {
9797 { "/proc/latency_stats" , READONLY , true },
9898 { "/proc/mtrr" , READONLY , true },
9999 { "/proc/scsi" , READONLY , true },
100- { "/proc/sys" , READONLY , false },
100+ { "/proc/sys" , READONLY , true },
101101 { "/proc/sysrq-trigger" , READONLY , true },
102102 { "/proc/timer_stats" , READONLY , true },
103103 { "/sys" , READONLY , false },
@@ -863,22 +863,53 @@ static int mount_sysfs(const MountEntry *m) {
863863 return 1 ;
864864}
865865
866- static int mount_procfs (const MountEntry * m ) {
867- int r ;
866+ static int mount_procfs (const MountEntry * m , const NamespaceInfo * ns_info ) { /* Mounts a fresh procfs instance on the entry's path, honoring ns_info->protect_proc and ns_info->proc_subset. Returns 1 once mounted, -ENOMEM if the option string cannot be allocated, otherwise whatever log_debug_errno() returns. */
867+ const char * entry_path ;
868868
869869 assert (m );
870+ assert (ns_info );
870871
871- ( void ) mkdir_p_label ( mount_entry_path (m ), 0755 );
872+ entry_path = mount_entry_path (m );
872873
873- r = path_is_mount_point (mount_entry_path (m ), NULL , 0 );
874- if (r < 0 )
875- return log_debug_errno (r , "Unable to determine whether /proc is already mounted: %m" );
876- if (r > 0 ) /* make this a NOP if /proc is already a mount point */
877- return 0 ;
874+ /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
875+ * one. i.e. we don't reuse existing mounts here under any condition, we want a new instance owned by
876+ * our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
877+ * mounted on /proc/ first. */
878878
879- /* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
880- if (mount ("proc" , mount_entry_path (m ), "proc" , MS_NOSUID |MS_NOEXEC |MS_NODEV , NULL ) < 0 )
881- return log_debug_errno (errno , "Failed to mount %s: %m" , mount_entry_path (m ));
879+ (void ) mkdir_p_label (entry_path , 0755 ); /* best-effort: result deliberately ignored */
880+ (void ) umount_recursive (entry_path , 0 ); /* best-effort: result deliberately ignored */
881+
882+ if (ns_info -> protect_proc != PROTECT_PROC_DEFAULT ||
883+ ns_info -> proc_subset != PROC_SUBSET_ALL ) {
884+ _cleanup_free_ char * opts = NULL ;
885+
886+ /* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
887+ * pretended to be per-instance but actually was per-namespace), hence let's make use of it
888+ * if requested. To make sure this logic succeeds only on kernels where hidepid= is
889+ * per-instance, we'll exclusively use the textual value for hidepid=, since support was
890+ * added in the same commit: if it's supported it is thus also per-instance. */
891+
892+ opts = strjoin ("hidepid=" ,
893+ ns_info -> protect_proc == PROTECT_PROC_DEFAULT ? "off" : /* only subset= was requested: spell out the unrestricted hidepid= value */
894+ protect_proc_to_string (ns_info -> protect_proc ),
895+ ns_info -> proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "" );
896+ if (!opts )
897+ return - ENOMEM ;
898+
899+ if (mount ("proc" , entry_path , "proc" , MS_NOSUID |MS_NOEXEC |MS_NODEV , opts ) < 0 ) {
900+ if (errno != EINVAL )
901+ return log_debug_errno (errno , "Failed to mount %s (options=%s): %m" , mount_entry_path (m ), opts );
902+
903+ /* If this failed with EINVAL then this likely means the textual hidepid= stuff is
904+ * not supported by the kernel, and thus the per-instance hidepid= neither, which
905+ * means we really don't want to use it, since it would affect our host's /proc
906+ * mount. Hence let's gracefully fall back to a classic, unrestricted version. */
907+ } else
908+ return 1 ; /* mounted with the requested options */
909+ }
910+
911+ if (mount ("proc" , entry_path , "proc" , MS_NOSUID |MS_NOEXEC |MS_NODEV , NULL ) < 0 ) /* reached when no options were requested, or after the EINVAL fallback above */
912+ return log_debug_errno (errno , "Failed to mount %s (no options): %m" , mount_entry_path (m ));
882913
883914 return 1 ;
884915}
@@ -997,14 +1028,16 @@ static int follow_symlink(
9971028
9981029static int apply_mount (
9991030 const char * root_directory ,
1000- MountEntry * m ) {
1031+ MountEntry * m ,
1032+ const NamespaceInfo * ns_info ) {
10011033
10021034 _cleanup_free_ char * inaccessible = NULL ;
10031035 bool rbind = true, make = false;
10041036 const char * what ;
10051037 int r ;
10061038
10071039 assert (m );
1040+ assert (ns_info );
10081041
10091042 log_debug ("Applying namespace mount on %s" , mount_entry_path (m ));
10101043
@@ -1109,7 +1142,7 @@ static int apply_mount(
11091142 return mount_sysfs (m );
11101143
11111144 case PROCFS :
1112- return mount_procfs (m );
1145+ return mount_procfs (m , ns_info );
11131146
11141147 case MOUNT_IMAGES :
11151148 return mount_images (m );
@@ -1221,7 +1254,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
12211254
12221255 return ns_info -> mount_apivfs ||
12231256 ns_info -> protect_control_groups ||
1224- ns_info -> protect_kernel_tunables ;
1257+ ns_info -> protect_kernel_tunables ||
1258+ ns_info -> protect_proc != PROTECT_PROC_DEFAULT ||
1259+ ns_info -> proc_subset != PROC_SUBSET_ALL ;
12251260}
12261261
12271262static size_t namespace_calculate_mounts (
@@ -1717,7 +1752,7 @@ int setup_namespace(
17171752 break ;
17181753 }
17191754
1720- r = apply_mount (root , m );
1755+ r = apply_mount (root , m , ns_info );
17211756 if (r < 0 ) {
17221757 if (error_path && mount_entry_path (m ))
17231758 * error_path = strdup (mount_entry_path (m ));
@@ -2237,3 +2272,19 @@ static const char* const namespace_type_table[] = {
22372272};
22382273
22392274DEFINE_STRING_TABLE_LOOKUP (namespace_type , NamespaceType );
2275+
2276+ static const char * const protect_proc_table [_PROTECT_PROC_MAX ] = { /* String names for ProtectProc values; mount_procfs() appends the non-default names verbatim to "hidepid=" in the procfs mount options. */
2277+ [PROTECT_PROC_DEFAULT ] = "default" , /* not passed to the kernel: mount_procfs() substitutes "off" for this value */
2278+ [PROTECT_PROC_NOACCESS ] = "noaccess" ,
2279+ [PROTECT_PROC_INVISIBLE ] = "invisible" ,
2280+ [PROTECT_PROC_PTRACEABLE ] = "ptraceable" ,
2281+ };
2282+
2283+ DEFINE_STRING_TABLE_LOOKUP (protect_proc , ProtectProc ); /* NOTE(review): presumably defines protect_proc_to_string(), which mount_procfs() calls - confirm against the macro definition */
2284+
2285+ static const char * const proc_subset_table [_PROC_SUBSET_MAX ] = { /* String names for ProcSubset values. */
2286+ [PROC_SUBSET_ALL ] = "all" ,
2287+ [PROC_SUBSET_PID ] = "pid" , /* same spelling as the literal ",subset=pid" option that mount_procfs() builds by hand rather than via this table */
2288+ };
2289+
2290+ DEFINE_STRING_TABLE_LOOKUP (proc_subset , ProcSubset ); /* NOTE(review): presumably defines the proc_subset string<->enum helpers - confirm against the macro definition */
0 commit comments