Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions background/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,11 @@ type Process struct {
mu sync.Mutex
}

// New returns an instantiated daemon.
func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Process {
// New returns an instantiated daemon. binName is the expected
// /proc/<pid>/cmdline value used for exit detection; pass cmd for plain
// invocations, or the post-exec binary name for exec wrappers (e.g.
// "dockerd" when cmd is "unshare ... -- sh -c 'exec dockerd ...'").
func New(ctx context.Context, log slog.Logger, binName, cmd string, args ...string) *Process {
ctx, cancel := context.WithCancel(ctx)
return &Process{
ctx: ctx,
Expand All @@ -44,7 +47,7 @@ func New(ctx context.Context, log slog.Logger, cmd string, args ...string) *Proc
cmd: xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...),
log: log.Named(cmd),
userKilled: i64ptr(0),
binName: cmd,
binName: binName,
}
}

Expand Down Expand Up @@ -80,8 +83,8 @@ func (d *Process) Run() <-chan error {
}

// Restart kills the running process and reruns the command with the updated
// cmd and args.
func (d *Process) Restart(ctx context.Context, cmd string, args ...string) error {
// binName, cmd and args. See New for the meaning of binName.
func (d *Process) Restart(ctx context.Context, binName, cmd string, args ...string) error {
d.mu.Lock()
defer d.mu.Unlock()

Expand All @@ -96,7 +99,7 @@ func (d *Process) Restart(ctx context.Context, cmd string, args ...string) error
d.cmd = xunix.GetExecer(ctx).CommandContext(ctx, cmd, args...)
d.waitCh = make(chan error, 1)
d.userKilled = i64ptr(0)
d.binName = cmd
d.binName = binName

return d.startProcess()
}
Expand Down
49 changes: 44 additions & 5 deletions cli/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cli

import (
"context"
_ "embed"
"encoding/json"
"fmt"
"io"
Expand Down Expand Up @@ -225,14 +226,14 @@ func dockerCmd() *cobra.Command {
select {
// Start sysbox-mgr and sysbox-fs in order to run
// sysbox containers.
case err := <-background.New(ctx, log, "sysbox-mgr", sysboxArgs...).Run():
case err := <-background.New(ctx, log, "sysbox-mgr", "sysbox-mgr", sysboxArgs...).Run():
if ctx.Err() == nil {
blog.Info(sysboxErrMsg)
//nolint
log.Critical(ctx, "sysbox-mgr exited", slog.Error(err))
panic(err)
}
case err := <-background.New(ctx, log, "sysbox-fs").Run():
case err := <-background.New(ctx, log, "sysbox-fs", "sysbox-fs").Run():
if ctx.Err() == nil {
blog.Info(sysboxErrMsg)
//nolint
Expand All @@ -256,7 +257,8 @@ func dockerCmd() *cobra.Command {
log.Debug(ctx, "starting dockerd", slog.F("args", args))

blog.Info("Waiting for sysbox processes to startup...")
dockerd := background.New(ctx, log, "dockerd", dargs...)
wrapCmd, wrapArgs := wrapDockerdCmd(dargs)
dockerd := background.New(ctx, log, dockerdBinName, wrapCmd, wrapArgs...)
err = dockerd.Start()
if err != nil {
return xerrors.Errorf("start dockerd: %w", err)
Expand Down Expand Up @@ -289,7 +291,8 @@ func dockerCmd() *cobra.Command {
log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err))
}

err = dockerd.Restart(ctx, "dockerd", args...)
wrapCmd, wrapArgs := wrapDockerdCmd(args)
err = dockerd.Restart(ctx, dockerdBinName, wrapCmd, wrapArgs...)
if err != nil {
blog.Info("Failed to create Container-based Virtual Machine: " + err.Error())
//nolint
Expand Down Expand Up @@ -357,7 +360,8 @@ func dockerCmd() *cobra.Command {

log.Debug(ctx, "restarting dockerd", slog.F("args", args))

err = dockerd.Restart(ctx, "dockerd", args...)
wrapCmd, wrapArgs := wrapDockerdCmd(args)
err = dockerd.Restart(ctx, dockerdBinName, wrapCmd, wrapArgs...)
if err != nil {
return xerrors.Errorf("restart dockerd: %w", err)
}
Expand Down Expand Up @@ -881,6 +885,41 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) {
return args, nil
}

// dockerdBinName is the post-exec cmdline of the wrapped dockerd (unshare ->
// sh -> dockerd), used for background.Process exit detection. wrapDockerdCmd
// also hands it to `sh -c` as $0, which the wrapper script exec's.
const dockerdBinName = "dockerd"

// dockerdSubtreeControlMaxAttempts bounds the cgroup-subtree-control retry
// loop in wrap_dockerd.sh; the script exits with status 1 once the bound is
// reached. wrapDockerdCmd injects the value as the envbox_max_attempts shell
// variable. Diverges from moby's hack/dind, whose loop is unbounded.
const dockerdSubtreeControlMaxAttempts = 100

// wrapDockerdScript holds the contents of wrap_dockerd.sh, embedded at build
// time. wrapDockerdCmd runs it via `/bin/sh -c`.
//
//go:embed wrap_dockerd.sh
var wrapDockerdScript string

// wrapDockerdCmd builds the command line that launches dockerd through
// `unshare --cgroup /bin/sh -c <script>`. It returns the program to execute
// ("unshare") and its arguments; dargs are forwarded to dockerd unchanged.
//
// The script (wrap_dockerd.sh, prefixed with an envbox_max_attempts
// assignment) remounts cgroup2 and delegates controllers before exec'ing
// dockerd, so that inner container cgroups become descendants of the envbox
// container's own cgroup on the host. That restores pod attribution for
// cgroup-aware tools (Tetragon, Falco, etc.).
//
// --mount is deliberately NOT passed to unshare: the remount is meant to leak
// into envbox's mount namespace so that sysbox-fs's /var/lib/sysboxfs/ mounts
// stay visible to sysbox-runc. xunix.readCPUQuotaCGroupV2 has a fallback for
// the cpu.max path change this causes.
//
// See: https://github.com/moby/moby/issues/45378#issuecomment-2886261231
func wrapDockerdCmd(dargs []string) (string, []string) {
	// The embedded script reads its retry bound from a shell variable, so the
	// assignment is placed on the line before the script body.
	script := fmt.Sprintf("envbox_max_attempts=%d\n%s", dockerdSubtreeControlMaxAttempts, wrapDockerdScript)

	// With `sh -c`, the first operand after the script becomes $0 and the
	// rest become "$@"; the script finishes with `exec "$0" "$@"`.
	prefix := [...]string{"--cgroup", "/bin/sh", "-c", script, dockerdBinName}

	argv := make([]string, 0, len(prefix)+len(dargs))
	argv = append(argv, prefix[:]...)
	argv = append(argv, dargs...)
	return "unshare", argv
}

// TODO This is bad code.
func filterElements(ss []string, filters ...string) []string {
filtered := make([]string, 0, len(ss))
Expand Down
109 changes: 100 additions & 9 deletions cli/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,70 @@ func TestDocker(t *testing.T) {
fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR),
)

// dockerd is launched via an unshare wrapper that exec's into
// dockerd with these args. The expected argv is specified inline
// (including the full shell script) so this test fails loudly if
// the wrapper changes; TestWrapDockerdCmd covers the wrapper
// structure independently.
const expectedShellScript = `envbox_max_attempts=100
# shellcheck shell=sh
# shellcheck disable=SC2154 # envbox_max_attempts is prepended by the Go caller

# cgroup v2: enable nesting. Mirrors moby's hack/dind L61-79
# (https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79),
# bounded by envbox_max_attempts.
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
# Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the
# fs root; inner container cgroups end up under the envbox container's
# cgroup on the host.
umount /sys/fs/cgroup || { echo "envbox: failed to umount /sys/fs/cgroup" >&2; exit 1; }
mount -t cgroup2 cgroup /sys/fs/cgroup || { echo "envbox: failed to mount cgroup2 on /sys/fs/cgroup" >&2; exit 1; }

# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
mkdir -p /sys/fs/cgroup/init || { echo "envbox: failed to mkdir /sys/fs/cgroup/init" >&2; exit 1; }
# this happens in a loop because things like "docker exec" on our dind
# container will create new processes, which creates a race between our
# moving everything to "init" and enabling subtree_control
envbox_attempts=0
while ! {
# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
# enable controllers
sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
> /sys/fs/cgroup/cgroup.subtree_control
}; do
envbox_attempts=$((envbox_attempts + 1))
if [ "$envbox_attempts" -ge "$envbox_max_attempts" ]; then
echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2
exit 1
fi
done
fi
exec "$0" "$@"
`
expectedArgv := []string{
"unshare",
"--cgroup",
"/bin/sh",
"-c",
expectedShellScript,
"dockerd",
"--debug",
"--log-level=debug",
fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
"--userns-remap=coder",
"--storage-driver=overlay2",
fmt.Sprintf("--bip=%s", bridgeCIDR),
}

execer := clitest.Execer(ctx)
execer.AddCommands(&xunixfake.FakeCmd{
FakeCmd: &testingexec.FakeCmd{
Argv: []string{
"dockerd",
"--debug",
"--log-level=debug",
fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
"--userns-remap=coder",
"--storage-driver=overlay2",
fmt.Sprintf("--bip=%s", bridgeCIDR),
},
Argv: expectedArgv,
},
})

Expand Down Expand Up @@ -741,6 +793,45 @@ func TestDocker(t *testing.T) {
})
}

// TestWrapDockerdCmd pins the structure of the dockerd wrapper command: the
// program name, the fixed argv prefix, pass-through of the dockerd arguments,
// and the key statements of the generated shell script.
func TestWrapDockerdCmd(t *testing.T) {
	t.Parallel()

	passthrough := []string{"--debug", "--mtu=1500"}
	bin, argv := cli.WrapDockerdCmd(passthrough)

	// `unshare` runs `/bin/sh -c <script>`, which in turn exec's dockerd, so
	// /proc/<pid>/cmdline ends up as "dockerd" -- the value that
	// background.Process tracking should compare against.
	require.Equal(t, "unshare", bin)
	require.Equal(t, "dockerd", cli.DockerdBinName)

	// Expected layout: --cgroup /bin/sh -c <script> dockerd <passthrough...>
	require.GreaterOrEqual(t, len(argv), 6, "args=%v", argv)
	fixed := []struct {
		idx  int
		want string
	}{
		{0, "--cgroup"},
		{1, "/bin/sh"},
		{2, "-c"},
		{4, cli.DockerdBinName},
	}
	for _, f := range fixed {
		require.Equal(t, f.want, argv[f.idx])
	}
	require.Equal(t, passthrough, argv[5:])

	// Fragments the script must contain, in the order they are checked:
	//   - the prepended envbox_max_attempts assignment (so the embedded
	//     script can reference it as a variable)
	//   - the cgroup.controllers guard around the v2-only block
	//   - the umount/mount inside that guard (cgroupv1 hosts unaffected)
	//   - delegation via /init + subtree_control
	//   - the retry bound against envbox_max_attempts
	//   - the final exec into dockerd
	script := argv[3]
	fragments := []string{
		fmt.Sprintf("envbox_max_attempts=%d", cli.DockerdSubtreeControlMaxAttempts),
		"[ -f /sys/fs/cgroup/cgroup.controllers ]",
		"umount /sys/fs/cgroup",
		"mount -t cgroup2 cgroup /sys/fs/cgroup",
		"mkdir -p /sys/fs/cgroup/init",
		"/sys/fs/cgroup/cgroup.subtree_control",
		`ge "$envbox_max_attempts" ]`,
		`exec "$0" "$@"`,
	}
	for _, fragment := range fragments {
		require.Contains(t, script, fragment)
	}
}

// rawDockerAuth is sample input for a kubernetes secret to a gcr.io private
// registry.
const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}`
8 changes: 8 additions & 0 deletions cli/export_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package cli

// Exported handles on unexported identifiers so that the external _test
// package can reference them. Because this file is a _test.go file, these
// names exist only in test builds and do not widen the package's real API.
var (
WrapDockerdCmd = wrapDockerdCmd
DockerdBinName = dockerdBinName
DockerdSubtreeControlMaxAttempts = dockerdSubtreeControlMaxAttempts
)
38 changes: 38 additions & 0 deletions cli/wrap_dockerd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# shellcheck shell=sh
# shellcheck disable=SC2154 # envbox_max_attempts is prepended by the Go caller

# cgroup v2: enable nesting. Mirrors moby's hack/dind L61-79
# (https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79),
# bounded by envbox_max_attempts.
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
# Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the
# fs root; inner container cgroups end up under the envbox container's
# cgroup on the host.
umount /sys/fs/cgroup || { echo "envbox: failed to umount /sys/fs/cgroup" >&2; exit 1; }
mount -t cgroup2 cgroup /sys/fs/cgroup || { echo "envbox: failed to mount cgroup2 on /sys/fs/cgroup" >&2; exit 1; }

# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
mkdir -p /sys/fs/cgroup/init || { echo "envbox: failed to mkdir /sys/fs/cgroup/init" >&2; exit 1; }
# this happens in a loop because things like "docker exec" on our dind
# container will create new processes, which creates a race between our
# moving everything to "init" and enabling subtree_control
envbox_attempts=0
while ! {
# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
# enable controllers
sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
> /sys/fs/cgroup/cgroup.subtree_control
}; do
envbox_attempts=$((envbox_attempts + 1))
if [ "$envbox_attempts" -ge "$envbox_max_attempts" ]; then
echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2
exit 1
fi
done
fi
exec "$0" "$@"
16 changes: 14 additions & 2 deletions xunix/sys.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,21 @@ func readCPUQuotaCGroupV2(ctx context.Context) (CPUQuota, error) {
return CPUQuota{}, xerrors.Errorf("determine own cgroup: %w", err)
}

maxStr, err := afero.ReadFile(fs, filepath.Join("/sys/fs/cgroup/", self, "cpu.max"))
// Try the standard path (self-relative) first.
selfPath := filepath.Join("/sys/fs/cgroup/", self, "cpu.max")
maxStr, err := afero.ReadFile(fs, selfPath)
if err != nil {
return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err)
// Fallback: read cpu.max at the mount root. This handles the case where
// `/sys/fs/cgroup/` has been remounted to be rooted at the current
// cgroup (e.g. after `unshare --cgroup` + `mount -t cgroup2`), so the
// self-relative path no longer exists — but the mount root IS the
// current cgroup and its cpu.max reflects the same limits.
const rootPath = "/sys/fs/cgroup/cpu.max"
rootMaxStr, rootErr := afero.ReadFile(fs, rootPath)
if rootErr != nil {
return CPUQuota{}, xerrors.Errorf("read cpu.max outside container (tried %s: %v; fallback %s: %w)", selfPath, err, rootPath, rootErr)
}
maxStr = rootMaxStr
}

list := strings.Split(string(bytes.TrimSpace(maxStr)), " ")
Expand Down
12 changes: 12 additions & 0 deletions xunix/sys_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@ func TestReadCPUQuota(t *testing.T) {
},
Expected: xunix.CPUQuota{Quota: -1, Period: 100000, CGroup: xunix.CGroupV2},
},
{
// Simulates the cgroup namespace + remount the dockerd wrapper
// performs: the self-relative path no longer exists but cpu.max
// at the mount root reflects the same limits.
Name: "CGroupV2_RootFallback",
Subpath: "docker/dummy",
FS: map[string]string{
"/proc/self/cgroup": "0::/kubepods/pod/container\n",
"/sys/fs/cgroup/cpu.max": "150000 100000\n",
},
Expected: xunix.CPUQuota{Quota: 150000, Period: 100000, CGroup: xunix.CGroupV2},
},
{
Name: "Empty",
FS: map[string]string{},
Expand Down