X Tutup
Skip to content

Commit 35d20c4

Browse files
chown cgroup to process uid in container namespace
Delegating cgroups to the container enables more complex workloads, including systemd-based workloads. The OCI runtime-spec was recently updated to explicitly admit such delegation, through specification of cgroup ownership semantics: opencontainers/runtime-spec#1123 Pursuant to the updated OCI runtime-spec, change the ownership of the container's cgroup directory and particular files therein, when using cgroups v2 and when the cgroupfs is to be mounted read/write. As a result of this change, systemd workloads can run in isolated user namespaces on OpenShift when the sandbox's cgroupfs is mounted read/write. It might be possible to implement this feature in other cgroup managers, but that work is deferred. Signed-off-by: Fraser Tweedale <ftweedal@redhat.com>
1 parent 6ff0420 commit 35d20c4

File tree

4 files changed

+150
-0
lines changed

4 files changed

+150
-0
lines changed

libcontainer/cgroups/systemd/v2.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package systemd
22

33
import (
4+
"bufio"
45
"fmt"
56
"math"
7+
"os"
68
"path/filepath"
79
"strconv"
810
"strings"
@@ -288,9 +290,46 @@ func (m *unifiedManager) Apply(pid int) error {
288290
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
289291
return err
290292
}
293+
294+
if c.OwnerUID != nil {
295+
filesToChown, err := cgroupFilesToChown()
296+
if err != nil {
297+
return err
298+
}
299+
300+
for _, v := range filesToChown {
301+
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
302+
if err != nil {
303+
return err
304+
}
305+
}
306+
}
307+
291308
return nil
292309
}
293310

311+
// cgroupFilesToChown returns the paths, relative to a cgroup directory, whose
// ownership has to be changed when the cgroup is delegated. The first entry
// is always ".", i.e. the cgroup directory itself.
//
// The remaining entries are read, one per line, from the list the kernel
// publishes in /sys/kernel/cgroup/delegate. If that file cannot be opened
// (for example on Linux < 4.15, where it is not present), the initial set of
// files mentioned in cgroups(7) is returned instead and no error is reported.
//
// An error is returned only when the delegate file was opened but reading it
// failed.
func cgroupFilesToChown() ([]string, error) {
	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"

	delegate, openErr := os.Open(cgroupDelegateFile)
	if openErr != nil {
		// No kernel-provided list is available: fall back to the static one.
		return []string{".", "cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
	}
	defer delegate.Close()

	// The directory itself must be chowned, followed by every listed file.
	paths := []string{"."}
	lines := bufio.NewScanner(delegate)
	for lines.Scan() {
		paths = append(paths, lines.Text())
	}
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, scanErr)
	}

	return paths, nil
}
332+
294333
func (m *unifiedManager) Destroy() error {
295334
m.mu.Lock()
296335
defer m.mu.Unlock()

libcontainer/configs/cgroup_linux.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ type Cgroup struct {
4141

4242
// Rootless tells if rootless cgroups should be used.
4343
Rootless bool
44+
45+
// The host UID that should own the cgroup, or nil to accept
46+
// the default ownership. This should only be set when the
47+
// cgroupfs is to be mounted read/write.
48+
// Not all cgroup manager implementations support changing
49+
// the ownership.
50+
OwnerUID *int `json:"owner_uid,omitempty"`
4451
}
4552

4653
type Resources struct {

libcontainer/specconv/spec_linux.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
366366
}
367367
}
368368
}
369+
370+
// Set the host UID that should own the container's cgroup.
371+
// This must be performed after setupUserNamespace, so that
372+
// config.HostRootUID() returns the correct result.
373+
//
374+
// Only set it if the container will have its own cgroup
375+
// namespace and the cgroupfs will be mounted read/write.
376+
//
377+
hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == ""
378+
hasRwCgroupfs := false
379+
if hasCgroupNS {
380+
for _, m := range config.Mounts {
381+
if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
382+
hasRwCgroupfs = true
383+
break
384+
}
385+
}
386+
}
387+
processUid := 0
388+
if spec.Process != nil {
389+
// Chown the cgroup to the UID running the process,
390+
// which is not necessarily UID 0 in the container
391+
// namespace (e.g., an unprivileged UID in the host
392+
// user namespace).
393+
processUid = int(spec.Process.User.UID)
394+
}
395+
if hasCgroupNS && hasRwCgroupfs {
396+
ownerUid, err := config.HostUID(processUid)
397+
// There are two error cases; we can ignore both.
398+
//
399+
// 1. uidMappings is unset. Either there is no user
400+
// namespace (fine), or it is an error (which is
401+
// checked elsewhere).
402+
//
403+
// 2. The user is unmapped in the user namespace. This is an
404+
// unusual configuration and might be an error. But it too
405+
// will be checked elsewhere, so we can ignore it here.
406+
//
407+
if err == nil {
408+
config.Cgroups.OwnerUID = &ownerUid
409+
}
410+
}
411+
369412
if spec.Process != nil {
370413
config.OomScoreAdj = spec.Process.OOMScoreAdj
371414
config.NoNewPrivileges = spec.Process.NoNewPrivileges
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/usr/bin/env bats
2+
3+
load helpers
4+
5+
function teardown() {
6+
teardown_bundle
7+
}
8+
9+
function setup() {
10+
requires root cgroups_v2 systemd
11+
12+
setup_busybox
13+
14+
# chown test temp dir to allow host user to read it
15+
chown 100000 "$ROOT"
16+
17+
# chown rootfs to allow host user to mkdir mount points
18+
chown 100000 "$ROOT"/bundle/rootfs
19+
20+
set_cgroups_path
21+
22+
# configure a user namespace
23+
update_config ' .linux.namespaces += [{"type": "user"}]
24+
| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
25+
| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
26+
'
27+
}
28+
29+
@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" {
30+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
31+
[ "$status" -eq 0 ]
32+
33+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
34+
[ "$status" -eq 0 ]
35+
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
36+
}
37+
38+
@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" {
39+
set_cgroup_mount_writable
40+
41+
# inherit cgroup namespace (remove cgroup from namespaces list)
42+
update_config '.linux.namespaces |= map(select(.type != "cgroup"))'
43+
44+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
45+
[ "$status" -eq 0 ]
46+
47+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
48+
[ "$status" -eq 0 ]
49+
[ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user
50+
}
51+
52+
@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" {
53+
set_cgroup_mount_writable
54+
55+
runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown
56+
[ "$status" -eq 0 ]
57+
58+
runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup"
59+
[ "$status" -eq 0 ]
60+
[ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace)
61+
}

0 commit comments

Comments
 (0)
X Tutup