diff --git a/background/process.go b/background/process.go index d6d52b4..9f30e0f 100644 --- a/background/process.go +++ b/background/process.go @@ -34,8 +34,11 @@ type Process struct { mu sync.Mutex } -// New returns an instantiated daemon. -func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Process { +// New returns an instantiated daemon. binName is the expected +// /proc/[pid]/cmdline value used for exit detection; pass cmd for plain +// invocations, or the post-exec binary name for exec wrappers (e.g. +// "dockerd" when cmd is "unshare ... -- sh -c 'exec dockerd ...'"). +func New(ctx context.Context, log slog.Logger, binName, cmd string, args ...string) *Process { ctx, cancel := context.WithCancel(ctx) return &Process{ ctx: ctx, @@ -44,7 +47,7 @@ func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Proc cmd: xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...), log: log.Named(cmd), userKilled: i64ptr(0), - binName: cmd, + binName: binName, } } @@ -80,8 +83,8 @@ func (d *Process) Run() <-chan error { } // Restart kill the running process and reruns the command with the updated -// cmd and args. -func (d *Process) Restart(ctx context.Context, cmd string, args ...string) error { +// binName, cmd and args. See New for the meaning of binName. +func (d *Process) Restart(ctx context.Context, binName, cmd string, args ...string) error { d.mu.Lock() defer d.mu.Unlock() @@ -96,7 +99,7 @@ func (d *Process) Restart(ctx context.Context, cmd string, args ...string) error d.cmd = xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...) 
d.waitCh = make(chan error, 1) d.userKilled = i64ptr(0) - d.binName = cmd + d.binName = binName return d.startProcess() } diff --git a/cli/docker.go b/cli/docker.go index c7da6fc..469bc69 100644 --- a/cli/docker.go +++ b/cli/docker.go @@ -2,6 +2,7 @@ package cli import ( "context" + _ "embed" "encoding/json" "fmt" "io" @@ -225,14 +226,14 @@ func dockerCmd() *cobra.Command { select { // Start sysbox-mgr and sysbox-fs in order to run // sysbox containers. - case err := <-background.New(ctx, log, "sysbox-mgr", sysboxArgs...).Run(): + case err := <-background.New(ctx, log, "sysbox-mgr", "sysbox-mgr", sysboxArgs...).Run(): if ctx.Err() == nil { blog.Info(sysboxErrMsg) //nolint log.Critical(ctx, "sysbox-mgr exited", slog.Error(err)) panic(err) } - case err := <-background.New(ctx, log, "sysbox-fs").Run(): + case err := <-background.New(ctx, log, "sysbox-fs", "sysbox-fs").Run(): if ctx.Err() == nil { blog.Info(sysboxErrMsg) //nolint @@ -256,7 +257,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "starting dockerd", slog.F("args", args)) blog.Info("Waiting for sysbox processes to startup...") - dockerd := background.New(ctx, log, "dockerd", dargs...) + wrapCmd, wrapArgs := wrapDockerdCmd(dargs) + dockerd := background.New(ctx, log, dockerdBinName, wrapCmd, wrapArgs...) err = dockerd.Start() if err != nil { return xerrors.Errorf("start dockerd: %w", err) @@ -289,7 +291,8 @@ func dockerCmd() *cobra.Command { log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) } - err = dockerd.Restart(ctx, "dockerd", args...) + wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, dockerdBinName, wrapCmd, wrapArgs...) if err != nil { blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) //nolint @@ -357,7 +360,8 @@ func dockerCmd() *cobra.Command { log.Debug(ctx, "restarting dockerd", slog.F("args", args)) - err = dockerd.Restart(ctx, "dockerd", args...) 
+ wrapCmd, wrapArgs := wrapDockerdCmd(args) + err = dockerd.Restart(ctx, dockerdBinName, wrapCmd, wrapArgs...) if err != nil { return xerrors.Errorf("restart dockerd: %w", err) } @@ -881,6 +885,41 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { return args, nil } +// dockerdBinName is the post-exec cmdline of the wrapped dockerd (unshare -> +// sh -> dockerd), used for background.Process exit detection. +const dockerdBinName = "dockerd" + +// dockerdSubtreeControlMaxAttempts bounds the cgroup-subtree-control retry +// loop in wrap_dockerd.sh. Diverges from moby's hack/dind (unbounded). +const dockerdSubtreeControlMaxAttempts = 100 + +//go:embed wrap_dockerd.sh +var wrapDockerdScript string + +// wrapDockerdCmd wraps dockerd with `unshare --cgroup` + cgroup2 remount and +// delegation (see wrap_dockerd.sh) so inner container cgroups become +// descendants of the envbox container's own cgroup on the host, restoring +// pod attribution for cgroup-aware tools (Tetragon, Falco, etc.). +// +// We do NOT pass --mount on unshare: the remount intentionally leaks into +// envbox's mount namespace so sysbox-fs's /var/lib/sysboxfs/ mounts stay +// visible to sysbox-runc. xunix.readCPUQuotaCGroupV2 has a fallback for +// the resulting cpu.max path change. +// +// See: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 +func wrapDockerdCmd(dargs []string) (string, []string) { + shellCmd := fmt.Sprintf("envbox_max_attempts=%d\n%s", dockerdSubtreeControlMaxAttempts, wrapDockerdScript) + wrapperArgs := []string{ + "--cgroup", + "/bin/sh", + "-c", + shellCmd, + dockerdBinName, + } + wrapperArgs = append(wrapperArgs, dargs...) + return "unshare", wrapperArgs +} + // TODO This is bad code. 
func filterElements(ss []string, filters ...string) []string { filtered := make([]string, 0, len(ss)) diff --git a/cli/docker_test.go b/cli/docker_test.go index 64ca338..4db427f 100644 --- a/cli/docker_test.go +++ b/cli/docker_test.go @@ -178,18 +178,70 @@ func TestDocker(t *testing.T) { fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR), ) + // dockerd is launched via an unshare wrapper that exec's into + // dockerd with these args. The expected argv is specified inline + // (including the full shell script) so this test fails loudly if + // the wrapper changes; TestWrapDockerdCmd covers the wrapper + // structure independently. + const expectedShellScript = `envbox_max_attempts=100 +# shellcheck shell=sh +# shellcheck disable=SC2154 # envbox_max_attempts is prepended by the Go caller + +# cgroup v2: enable nesting. Mirrors moby's hack/dind L61-79 +# (https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79), +# bounded by envbox_max_attempts. +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + # Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the + # fs root; inner container cgroups end up under the envbox container's + # cgroup on the host. + umount /sys/fs/cgroup || { echo "envbox: failed to umount /sys/fs/cgroup" >&2; exit 1; } + mount -t cgroup2 cgroup /sys/fs/cgroup || { echo "envbox: failed to mount cgroup2 on /sys/fs/cgroup" >&2; exit 1; } + + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + mkdir -p /sys/fs/cgroup/init || { echo "envbox: failed to mkdir /sys/fs/cgroup/init" >&2; exit 1; } + # this happens in a loop because things like "docker exec" on our dind + # container will create new processes, which creates a race between our + # moving everything to "init" and enabling subtree_control + envbox_attempts=0 + while ! 
{ + # move the processes from the root group to the /init group, + # otherwise writing subtree_control fails with EBUSY. + # An error during moving non-existent process (i.e., "cat") is ignored. + xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : + # enable controllers + sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ + > /sys/fs/cgroup/cgroup.subtree_control + }; do + envbox_attempts=$((envbox_attempts + 1)) + if [ "$envbox_attempts" -ge "$envbox_max_attempts" ]; then + echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2 + exit 1 + fi + done +fi +exec "$0" "$@" +` + expectedArgv := []string{ + "unshare", + "--cgroup", + "/bin/sh", + "-c", + expectedShellScript, + "dockerd", + "--debug", + "--log-level=debug", + fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), + "--userns-remap=coder", + "--storage-driver=overlay2", + fmt.Sprintf("--bip=%s", bridgeCIDR), + } + execer := clitest.Execer(ctx) execer.AddCommands(&xunixfake.FakeCmd{ FakeCmd: &testingexec.FakeCmd{ - Argv: []string{ - "dockerd", - "--debug", - "--log-level=debug", - fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), - "--userns-remap=coder", - "--storage-driver=overlay2", - fmt.Sprintf("--bip=%s", bridgeCIDR), - }, + Argv: expectedArgv, }, }) @@ -741,6 +793,45 @@ func TestDocker(t *testing.T) { }) } +func TestWrapDockerdCmd(t *testing.T) { + t.Parallel() + + dargs := []string{"--debug", "--mtu=1500"} + cmd, args := cli.WrapDockerdCmd(dargs) + + // The wrapper invokes `unshare`, exec'ing through `/bin/sh -c