X Tutup
Skip to content

Commit 4e39995

Browse files
committed
core: introduce ProtectProc= and ProcSubset= to expose hidepid= and subset= procfs mount options
Kernel 5.8 gained a hidepid= implementation that is truly per procfs, which allows us to mount a distinct one into every unit, with individual hidepid= settings. Let's expose this via two new settings: ProtectProc= (wrapping hidepid=) and ProcSubset= (wrapping subset=). Replaces: systemd#11670
1 parent df6b900 commit 4e39995

File tree

12 files changed

+131
-19
lines changed

12 files changed

+131
-19
lines changed

docs/TRANSIENT-SETTINGS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ All execution-related settings are available for transient units.
151151
✓ TimerSlackNSec=
152152
✓ NoNewPrivileges=
153153
✓ KeyringMode=
154+
✓ ProtectProc=
155+
✓ ProcSubset=
154156
✓ SystemCallFilter=
155157
✓ SystemCallArchitectures=
156158
✓ SystemCallErrorNumber=

src/core/dbus-execute.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_input, exec_input, ExecInp
4747
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode);
4848
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_preserve_mode, exec_preserve_mode, ExecPreserveMode);
4949
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
50+
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
51+
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
5052
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
5153
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
5254
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@@ -1016,6 +1018,8 @@ const sd_bus_vtable bus_exec_vtable[] = {
10161018
SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
10171019
SD_BUS_PROPERTY("MountAPIVFS", "b", bus_property_get_bool, offsetof(ExecContext, mount_apivfs), SD_BUS_VTABLE_PROPERTY_CONST),
10181020
SD_BUS_PROPERTY("KeyringMode", "s", property_get_exec_keyring_mode, offsetof(ExecContext, keyring_mode), SD_BUS_VTABLE_PROPERTY_CONST),
1021+
SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST),
1022+
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
10191023
SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
10201024
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
10211025

@@ -1354,6 +1358,8 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(utmp_mode, ExecUtmpMode, exec_utmp_mode_fr
13541358
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_system, ProtectSystem, protect_system_from_string);
13551359
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_from_string);
13561360
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
1361+
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
1362+
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
13571363
static BUS_DEFINE_SET_TRANSIENT_PARSE(preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
13581364
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
13591365
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@@ -1706,6 +1712,12 @@ int bus_exec_context_set_transient_property(
17061712
if (streq(name, "KeyringMode"))
17071713
return bus_set_transient_keyring_mode(u, name, &c->keyring_mode, message, flags, error);
17081714

1715+
if (streq(name, "ProtectProc"))
1716+
return bus_set_transient_protect_proc(u, name, &c->protect_proc, message, flags, error);
1717+
1718+
if (streq(name, "ProcSubset"))
1719+
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
1720+
17091721
if (streq(name, "RuntimeDirectoryPreserve"))
17101722
return bus_set_transient_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
17111723

src/core/execute.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1948,7 +1948,9 @@ static bool exec_needs_mount_namespace(
19481948
context->protect_kernel_tunables ||
19491949
context->protect_kernel_modules ||
19501950
context->protect_kernel_logs ||
1951-
context->protect_control_groups)
1951+
context->protect_control_groups ||
1952+
context->protect_proc != PROTECT_PROC_DEFAULT ||
1953+
context->proc_subset != PROC_SUBSET_ALL)
19521954
return true;
19531955

19541956
if (context->root_directory) {
@@ -2652,6 +2654,8 @@ static int apply_mount_namespace(
26522654
.private_mounts = context->private_mounts,
26532655
.protect_home = context->protect_home,
26542656
.protect_system = context->protect_system,
2657+
.protect_proc = context->protect_proc,
2658+
.proc_subset = context->proc_subset,
26552659
};
26562660
} else if (!context->dynamic_user && root_dir)
26572661
/*
@@ -4601,7 +4605,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
46014605
"%sRestrictRealtime: %s\n"
46024606
"%sRestrictSUIDSGID: %s\n"
46034607
"%sKeyringMode: %s\n"
4604-
"%sProtectHostname: %s\n",
4608+
"%sProtectHostname: %s\n"
4609+
"%sProtectProc: %s\n"
4610+
"%sProcSubset: %s\n",
46054611
prefix, c->umask,
46064612
prefix, c->working_directory ? c->working_directory : "/",
46074613
prefix, c->root_directory ? c->root_directory : "/",
@@ -4623,7 +4629,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
46234629
prefix, yes_no(c->restrict_realtime),
46244630
prefix, yes_no(c->restrict_suid_sgid),
46254631
prefix, exec_keyring_mode_to_string(c->keyring_mode),
4626-
prefix, yes_no(c->protect_hostname));
4632+
prefix, yes_no(c->protect_hostname),
4633+
prefix, protect_proc_to_string(c->protect_proc),
4634+
prefix, proc_subset_to_string(c->proc_subset));
46274635

46284636
if (c->root_image)
46294637
fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);

src/core/execute.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,9 @@ struct ExecContext {
260260

261261
char *log_namespace;
262262

263+
ProtectProc protect_proc; /* hidepid= */
264+
ProcSubset proc_subset; /* subset= */
265+
263266
bool private_tmp;
264267
bool private_network;
265268
bool private_devices;

src/core/load-fragment-gperf.gperf.m4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ $1.AmbientCapabilities, config_parse_capability_set, 0,
7373
$1.TimerSlackNSec, config_parse_nsec, 0, offsetof($1, exec_context.timer_slack_nsec)
7474
$1.NoNewPrivileges, config_parse_bool, 0, offsetof($1, exec_context.no_new_privileges)
7575
$1.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof($1, exec_context.keyring_mode)
76+
$1.ProtectProc, config_parse_protect_proc, 0, offsetof($1, exec_context.protect_proc)
77+
$1.ProcSubset, config_parse_proc_subset, 0, offsetof($1, exec_context.proc_subset)
7678
m4_ifdef(`HAVE_SECCOMP',
7779
`$1.SystemCallFilter, config_parse_syscall_filter, 0, offsetof($1, exec_context)
7880
$1.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof($1, exec_context.syscall_archs)

src/core/load-fragment.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ DEFINE_CONFIG_PARSE(config_parse_exec_secure_bits, secure_bits_from_string, "Fai
118118
DEFINE_CONFIG_PARSE_ENUM(config_parse_collect_mode, collect_mode, CollectMode, "Failed to parse garbage collection mode");
119119
DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGroupDevicePolicy, "Failed to parse device policy");
120120
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode, "Failed to parse keyring mode");
121+
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc, "Failed to parse /proc/ protection mode");
122+
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset, "Failed to parse /proc/ subset mode");
121123
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_utmp_mode, exec_utmp_mode, ExecUtmpMode, "Failed to parse utmp mode");
122124
DEFINE_CONFIG_PARSE_ENUM(config_parse_job_mode, job_mode, JobMode, "Failed to parse job mode");
123125
DEFINE_CONFIG_PARSE_ENUM(config_parse_notify_access, notify_access, NotifyAccess, "Failed to parse notify access specifier");

src/core/load-fragment.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
108108
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
109109
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
110110
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
111+
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
112+
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
111113
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
112114
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
113115
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);

src/core/namespace.c

Lines changed: 67 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ static const MountEntry protect_kernel_tunables_table[] = {
9797
{ "/proc/latency_stats", READONLY, true },
9898
{ "/proc/mtrr", READONLY, true },
9999
{ "/proc/scsi", READONLY, true },
100-
{ "/proc/sys", READONLY, false },
100+
{ "/proc/sys", READONLY, true },
101101
{ "/proc/sysrq-trigger", READONLY, true },
102102
{ "/proc/timer_stats", READONLY, true },
103103
{ "/sys", READONLY, false },
@@ -863,22 +863,53 @@ static int mount_sysfs(const MountEntry *m) {
863863
return 1;
864864
}
865865

866-
static int mount_procfs(const MountEntry *m) {
867-
int r;
866+
static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
867+
const char *entry_path;
868868

869869
assert(m);
870+
assert(ns_info);
870871

871-
(void) mkdir_p_label(mount_entry_path(m), 0755);
872+
entry_path = mount_entry_path(m);
872873

873-
r = path_is_mount_point(mount_entry_path(m), NULL, 0);
874-
if (r < 0)
875-
return log_debug_errno(r, "Unable to determine whether /proc is already mounted: %m");
876-
if (r > 0) /* make this a NOP if /proc is already a mount point */
877-
return 0;
874+
/* Mount a new instance, so that we get the one that matches our user namespace, if we are running in
875+
* one. i.e we don't reuse existing mounts here under any condition, we want a new instance owned by
876+
* our user namespace and with our hidepid= settings applied. Hence, let's get rid of everything
877+
* mounted on /proc/ first. */
878878

879-
/* Mount a new instance, so that we get the one that matches our user namespace, if we are running in one */
880-
if (mount("proc", mount_entry_path(m), "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
881-
return log_debug_errno(errno, "Failed to mount %s: %m", mount_entry_path(m));
879+
(void) mkdir_p_label(entry_path, 0755);
880+
(void) umount_recursive(entry_path, 0);
881+
882+
if (ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
883+
ns_info->proc_subset != PROC_SUBSET_ALL) {
884+
_cleanup_free_ char *opts = NULL;
885+
886+
/* Starting with kernel 5.8 procfs' hidepid= logic is truly per-instance (previously it
887+
* pretended to be per-instance but actually was per-namespace), hence let's make use of it
888+
* if requested. To make sure this logic succeeds only on kernels where hidepid= is
889+
* per-instance, we'll exclusively use the textual value for hidepid=, since support was
890+
* added in the same commit: if it's supported it is thus also per-instance. */
891+
892+
opts = strjoin("hidepid=",
893+
ns_info->protect_proc == PROTECT_PROC_DEFAULT ? "off" :
894+
protect_proc_to_string(ns_info->protect_proc),
895+
ns_info->proc_subset == PROC_SUBSET_PID ? ",subset=pid" : "");
896+
if (!opts)
897+
return -ENOMEM;
898+
899+
if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts) < 0) {
900+
if (errno != EINVAL)
901+
return log_debug_errno(errno, "Failed to mount %s (options=%s): %m", mount_entry_path(m), opts);
902+
903+
/* If this failed with EINVAL then this likely means the textual hidepid= stuff is
904+
* not supported by the kernel, and thus the per-instance hidepid= neither, which
905+
* means we really don't want to use it, since it would affect our host's /proc
906+
* mount. Hence let's gracefully fallback to a classic, unrestricted version. */
907+
} else
908+
return 1;
909+
}
910+
911+
if (mount("proc", entry_path, "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
912+
return log_debug_errno(errno, "Failed to mount %s (no options): %m", mount_entry_path(m));
882913

883914
return 1;
884915
}
@@ -997,14 +1028,16 @@ static int follow_symlink(
9971028

9981029
static int apply_mount(
9991030
const char *root_directory,
1000-
MountEntry *m) {
1031+
MountEntry *m,
1032+
const NamespaceInfo *ns_info) {
10011033

10021034
_cleanup_free_ char *inaccessible = NULL;
10031035
bool rbind = true, make = false;
10041036
const char *what;
10051037
int r;
10061038

10071039
assert(m);
1040+
assert(ns_info);
10081041

10091042
log_debug("Applying namespace mount on %s", mount_entry_path(m));
10101043

@@ -1109,7 +1142,7 @@ static int apply_mount(
11091142
return mount_sysfs(m);
11101143

11111144
case PROCFS:
1112-
return mount_procfs(m);
1145+
return mount_procfs(m, ns_info);
11131146

11141147
case MOUNT_IMAGES:
11151148
return mount_images(m);
@@ -1221,7 +1254,9 @@ static bool namespace_info_mount_apivfs(const NamespaceInfo *ns_info) {
12211254

12221255
return ns_info->mount_apivfs ||
12231256
ns_info->protect_control_groups ||
1224-
ns_info->protect_kernel_tunables;
1257+
ns_info->protect_kernel_tunables ||
1258+
ns_info->protect_proc != PROTECT_PROC_DEFAULT ||
1259+
ns_info->proc_subset != PROC_SUBSET_ALL;
12251260
}
12261261

12271262
static size_t namespace_calculate_mounts(
@@ -1717,7 +1752,7 @@ int setup_namespace(
17171752
break;
17181753
}
17191754

1720-
r = apply_mount(root, m);
1755+
r = apply_mount(root, m, ns_info);
17211756
if (r < 0) {
17221757
if (error_path && mount_entry_path(m))
17231758
*error_path = strdup(mount_entry_path(m));
@@ -2237,3 +2272,19 @@ static const char* const namespace_type_table[] = {
22372272
};
22382273

22392274
DEFINE_STRING_TABLE_LOOKUP(namespace_type, NamespaceType);
2275+
2276+
static const char* const protect_proc_table[_PROTECT_PROC_MAX] = {
2277+
[PROTECT_PROC_DEFAULT] = "default",
2278+
[PROTECT_PROC_NOACCESS] = "noaccess",
2279+
[PROTECT_PROC_INVISIBLE] = "invisible",
2280+
[PROTECT_PROC_PTRACEABLE] = "ptraceable",
2281+
};
2282+
2283+
DEFINE_STRING_TABLE_LOOKUP(protect_proc, ProtectProc);
2284+
2285+
static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
2286+
[PROC_SUBSET_ALL] = "all",
2287+
[PROC_SUBSET_PID] = "pid",
2288+
};
2289+
2290+
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);

src/core/namespace.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,22 @@ typedef enum ProtectSystem {
4747
_PROTECT_SYSTEM_INVALID = -1
4848
} ProtectSystem;
4949

50+
typedef enum ProtectProc {
51+
PROTECT_PROC_DEFAULT,
52+
PROTECT_PROC_NOACCESS, /* hidepid=noaccess */
53+
PROTECT_PROC_INVISIBLE, /* hidepid=invisible */
54+
PROTECT_PROC_PTRACEABLE, /* hidepid=ptraceable */
55+
_PROTECT_PROC_MAX,
56+
_PROTECT_PROC_INVALID = -1,
57+
} ProtectProc;
58+
59+
typedef enum ProcSubset {
60+
PROC_SUBSET_ALL,
61+
PROC_SUBSET_PID, /* subset=pid */
62+
_PROC_SUBSET_MAX,
63+
_PROC_SUBSET_INVALID = -1,
64+
} ProcSubset;
65+
5066
struct NamespaceInfo {
5167
bool ignore_protect_paths:1;
5268
bool private_dev:1;
@@ -59,6 +75,8 @@ struct NamespaceInfo {
5975
bool protect_hostname:1;
6076
ProtectHome protect_home;
6177
ProtectSystem protect_system;
78+
ProtectProc protect_proc;
79+
ProcSubset proc_subset;
6280
};
6381

6482
struct BindMount {
@@ -135,6 +153,12 @@ ProtectHome protect_home_from_string(const char *s) _pure_;
135153
const char* protect_system_to_string(ProtectSystem p) _const_;
136154
ProtectSystem protect_system_from_string(const char *s) _pure_;
137155

156+
const char* protect_proc_to_string(ProtectProc i) _const_;
157+
ProtectProc protect_proc_from_string(const char *s) _pure_;
158+
159+
const char* proc_subset_to_string(ProcSubset i) _const_;
160+
ProcSubset proc_subset_from_string(const char *s) _pure_;
161+
138162
void bind_mount_free_many(BindMount *b, size_t n);
139163
int bind_mount_add(BindMount **b, size_t *n, const BindMount *item);
140164

src/shared/bus-unit-util.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,8 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
855855
"RuntimeDirectoryPreserve",
856856
"Personality",
857857
"KeyringMode",
858+
"ProtectProc",
859+
"ProcSubset",
858860
"NetworkNamespacePath",
859861
"LogNamespace"))
860862
return bus_append_string(m, field, eq);

0 commit comments

Comments
 (0)
X Tutup