From df4be9fcd85ddca987b8043c5a47d3a4096e2c81 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Sun, 31 May 2026 01:40:49 +0000
Subject: [PATCH 1/4] Fix pause container of a privileged pod unable to start
 due to /sys mount option mismatch

Starting with v2, containerd mounts /sys as rw on the sandbox container when the
pod is privileged (1fc497218 "Fix privileged container sysfs can't be rw because
pod is ro by default") instead of ro.  This means that the mount list for a
privileged pause container no longer matches with just data.defaultMounts and
will need a special case for sysfs.  Alternative options were also considered -
see the comment in framework.rego.

This change in GCS is necessary even though this can be fixed via a policy
change, because we need to maintain compatibility with existing policies.

This PR converts EnforceCreateContainerPolicy in the LCOW GCS to
EnforceCreateContainerPolicyV2, in order to use the CreateContainerOptions
struct to pass an additional bool indicating whether the current container is a
sandbox container.

This does not allow additional capabilities etc for the sandbox container, only
that sysfs can now be rw.

Assisted-by: GitHub Copilot:claude-opus-4.7
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 internal/guest/runtime/hcsv2/uvm.go           | 22 ++++++----
 pkg/securitypolicy/framework.rego             | 40 +++++++++++++++++--
 pkg/securitypolicy/securitypolicyenforcer.go  |  3 ++
 .../securitypolicyenforcer_rego.go            |  2 +
 4 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go
index 0cd6448cae..92bf0bb1bf 100644
--- a/internal/guest/runtime/hcsv2/uvm.go
+++ b/internal/guest/runtime/hcsv2/uvm.go
@@ -655,21 +655,27 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM
 		return nil, err
 	}
 
-	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicy(
+	privileged := isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification)
+	noNewPrivileges := settings.OCISpecification.Process.NoNewPrivileges
+	opts := &securitypolicy.CreateContainerOptions{
+		SandboxID:            sandboxID,
+		Privileged:           &privileged,
+		NoNewPrivileges:      &noNewPrivileges,
+		Groups:               groups,
+		Umask:                umask,
+		Capabilities:         settings.OCISpecification.Process.Capabilities,
+		SeccompProfileSHA256: seccomp,
+		IsSandboxContainer:   c.isSandbox,
+	}
+	envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2(
 		ctx,
-		sandboxID,
 		id,
 		settings.OCISpecification.Process.Args,
 		settings.OCISpecification.Process.Env,
 		settings.OCISpecification.Process.Cwd,
 		settings.OCISpecification.Mounts,
-		isPrivilegedContainerCreationRequest(ctx, settings.OCISpecification),
-		settings.OCISpecification.Process.NoNewPrivileges,
 		user,
-		groups,
-		umask,
-		settings.OCISpecification.Process.Capabilities,
-		seccomp,
+		opts,
 	)
 	if err != nil {
 		return nil, errors.Wrapf(err, "container creation denied due to policy")
diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego
index daa0fe864e..5d90d275e8 100644
--- a/pkg/securitypolicy/framework.rego
+++ b/pkg/securitypolicy/framework.rego
@@ -784,6 +784,40 @@ mount_ok(mounts, allow_elevated, mount) {
     mountConstraint_ok(constraint, mount)
 }
 
+# Special case for the pod sandbox (pause) container: starting with v2,
+# containerd mounts /sys as rw on the sandbox container when the pod is
+# privileged (1fc497218 "Fix privileged container sysfs can't be rw because pod
+# is ro by default") instead of ro.  This means that the mount list for a
+# privileged pause container no longer matches with just data.defaultMounts and
+# will either need a special case for sysfs (which is the only mount being
+# treated differently), or use data.privilegedMounts.  However, if we blindly
+# use data.privilegedMounts, this could result in the host being able to mount
+# "privileged" mounts on even a non-privileged container, as long as it runs as
+# the sandbox.  Since we have no other way to determine if the sandbox should be
+# allowed to be privileged or not (input.privileged is set to false for the
+# pause container even if the pod is privileged), we just special case the sysfs
+# mount.  Furthermore, we only allow this special case if this policy allows any
+# privileged containers at all.
+mount_ok(mounts, allow_elevated, mount) {
+    input.isSandboxContainer
+
+    # we allow allow_elevated to be false since this is what existing policies
+    # already does, even when some workload containers can be privileged, the
+    # sandbox container itself is not.
+
+    mount.type == "sysfs"
+    mount.source == "sysfs"
+    mount.destination == "/sys"
+    count(mount.options) == 4
+    "nosuid" in mount.options
+    "noexec" in mount.options
+    "nodev" in mount.options
+    "rw" in mount.options
+
+    some c in candidate_containers
+    c.allow_elevated
+}
+
 mountList_ok(mounts, allow_elevated) {
     is_linux
     every mount in input.mounts {
@@ -1395,14 +1429,14 @@ filtered_registry_values(input_values, policy_values) := [input_val |
 registry_changes := {"allowed": true} {
     containers := data.metadata.matches[input.containerID]
     container := containers[_]
-    
+
     # Check if container has registry_changes defined in policy
     container.registry_changes
-    
+
     # If input has registry changes, filter to only matching ones
     input.registryChanges.AddValues
     matched_values := filtered_registry_values(input.registryChanges.AddValues, container.registry_changes.add_values)
-    
+
     # Build result with filtered AddValues
     result := {
         "AddValues": matched_values
diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go
index 3f94bae0f5..0c2a98e998 100644
--- a/pkg/securitypolicy/securitypolicyenforcer.go
+++ b/pkg/securitypolicy/securitypolicyenforcer.go
@@ -29,6 +29,9 @@ type CreateContainerOptions struct {
 	Umask                string
 	Capabilities         *oci.LinuxCapabilities
 	SeccompProfileSHA256 string
+	// IsSandboxContainer is true when the container being created is the cri
+	// pod sandbox container (usually it is the "pause" image).
+	IsSandboxContainer bool
 }
 type SignalContainerOptions struct {
 	IsInitProcess bool
diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go
index 6cd911b5ec..5e196ebd9a 100644
--- a/pkg/securitypolicy/securitypolicyenforcer_rego.go
+++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go
@@ -713,6 +713,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy(
 		Umask:                umask,
 		Capabilities:         capabilities,
 		SeccompProfileSHA256: seccompProfileSHA256,
+		IsSandboxContainer:   false,
 	}
 	return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts)
 }
@@ -757,6 +758,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2(
 			"umask":                opts.Umask,
 			"capabilities":         mapifyCapabilities(opts.Capabilities),
 			"seccompProfileSHA256": opts.SeccompProfileSHA256,
+			"isSandboxContainer":   opts.IsSandboxContainer,
 		}
 	case "windows":
 		// Dump full interpreter metadata for debugging diagnostics.

From ddca4c5f19597c441c9d9cb4fe4931daa6852277 Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Mon, 1 Jun 2026 23:02:54 +0000
Subject: [PATCH 2/4] Add tests for sandbox sysfs special-case

Assisted-by: GitHub Copilot:claude-opus-4.7
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 pkg/securitypolicy/regopolicy_linux_test.go | 164 ++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go
index 236e64d4bd..fbdd283de5 100644
--- a/pkg/securitypolicy/regopolicy_linux_test.go
+++ b/pkg/securitypolicy/regopolicy_linux_test.go
@@ -7696,3 +7696,167 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal {
 	}
 	return m
 }
+
+// setupSandboxSysfsTest builds a policy with exactly two containers: a
+// non-elevated container that the test request will match (acting as the
+// sandbox/pause container in the policy) and a separate elevated container so
+// that candidate_containers has at least one container with allow_elevated set
+// - this gates the sandbox sysfs carve-out in framework.rego.
+//
+// DefaultCRIMounts() is used as the policy's default mount set so the /sys
+// "ro" mount is in data.defaultMounts.
+func setupSandboxSysfsTest(t *testing.T) (
+	policy *regoEnforcer,
+	sandboxContainer *securityPolicyContainer,
+	containerID string,
+	envList []string,
+	user IDName,
+	groups []IDName,
+	capabilities *oci.LinuxCapabilities,
+) {
+	t.Helper()
+
+	gc := generateConstraints(testRand, 2)
+	for len(gc.containers) < 2 {
+		// Force exactly two containers if generator under-generated.
+		gc.containers = append(gc.containers, generateConstraintsContainer(testRand, 1, maxLayersInGeneratedContainer))
+	}
+
+	// containers[0] is the one the test will exercise: act as the pause
+	// container, never elevated. Strip its own mount constraints so the
+	// input mount list is matched purely against defaultMounts and the
+	// sandbox carve-out.
+	sandboxContainer = gc.containers[0]
+	sandboxContainer.AllowElevated = false
+	sandboxContainer.Mounts = nil
+
+	// At least one other container in the policy must be elevated to enable
+	// the sandbox sysfs carve-out.
+	gc.containers[1].AllowElevated = true
+
+	defaultMounts := DefaultCRIMounts()
+	privilegedMounts := DefaultCRIPrivilegedMounts()
+
+	var err error
+	policy, err = newRegoPolicy(gc.toPolicy().marshalRego(), defaultMounts, privilegedMounts, testOSType)
+	if err != nil {
+		t.Fatalf("failed to create policy: %v", err)
+	}
+
+	containerID, err = mountImageForContainer(policy, sandboxContainer)
+	if err != nil {
+		t.Fatalf("failed to mount image for sandbox container: %v", err)
+	}
+
+	envList = buildEnvironmentVariablesFromEnvRules(sandboxContainer.EnvRules, testRand)
+	user = buildIDNameFromConfig(sandboxContainer.User.UserIDName, testRand)
+	groups = buildGroupIDNamesFromUser(sandboxContainer.User, testRand)
+	capsExternal := copyLinuxCapabilities(sandboxContainer.Capabilities.toExternal())
+	capabilities = &capsExternal
+	return policy, sandboxContainer, containerID, envList, user, groups, capabilities
+}
+
+// sysfsMount returns a /sys sysfs mount with the given mode ("ro" or "rw"),
+// matching the option set produced by containerd's CRI sandbox spec.
+func sysfsMount(mode string) oci.Mount {
+	return oci.Mount{
+		Source:      "sysfs",
+		Destination: "/sys",
+		Type:        "sysfs",
+		Options:     []string{"nosuid", "noexec", "nodev", mode},
+	}
+}
+
+func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
+	cases := []struct {
+		name               string
+		isSandboxContainer bool
+		mode               string
+		expectAllowed      bool
+	}{
+		{"sandbox_ro", true, "ro", true},
+		{"sandbox_rw", true, "rw", true},
+		{"non_sandbox_ro", false, "ro", true},
+		{"non_sandbox_rw", false, "rw", false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Each subtest gets its own policy because a successful
+			// create_container records the container as "started" and a
+			// second call for the same containerID would be denied.
+			policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
+				setupSandboxSysfsTest(t)
+			noNewPriv := sandboxContainer.NoNewPrivileges
+			privileged := false
+			mounts := []oci.Mount{sysfsMount(tc.mode)}
+			_, _, _, err := policy.EnforceCreateContainerPolicyV2(
+				context.Background(),
+				containerID,
+				sandboxContainer.Command,
+				envList,
+				sandboxContainer.WorkingDir,
+				mounts,
+				user,
+				&CreateContainerOptions{
+					SandboxID:            testDataGenerator.uniqueSandboxID(),
+					Privileged:           &privileged,
+					NoNewPrivileges:      &noNewPriv,
+					Groups:               groups,
+					Umask:                sandboxContainer.User.Umask,
+					Capabilities:         capabilities,
+					SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
+					IsSandboxContainer:   tc.isSandboxContainer,
+				},
+			)
+			if tc.expectAllowed {
+				if err != nil {
+					t.Errorf("expected allowed, got error: %v", err)
+				}
+			} else {
+				if err == nil {
+					t.Errorf("expected denied, got allowed")
+				} else {
+					assertDecisionJSONContains(t, err, "invalid mount list", "/sys")
+				}
+			}
+		})
+	}
+}
+
+// Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied verifies that the
+// sysfs carve-out for the sandbox container does NOT also grant privilege:
+// even with IsSandboxContainer=true and the /sys rw mount accepted, if the
+// host requests Privileged=true for a sandbox container whose policy entry
+// does not allow elevation, create_container must still be denied.
+func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) {
+	policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
+		setupSandboxSysfsTest(t)
+
+	noNewPriv := sandboxContainer.NoNewPrivileges
+	privileged := true
+	mounts := []oci.Mount{sysfsMount("rw")}
+	_, _, _, err := policy.EnforceCreateContainerPolicyV2(
+		context.Background(),
+		containerID,
+		sandboxContainer.Command,
+		envList,
+		sandboxContainer.WorkingDir,
+		mounts,
+		user,
+		&CreateContainerOptions{
+			SandboxID:            testDataGenerator.uniqueSandboxID(),
+			Privileged:           &privileged,
+			NoNewPrivileges:      &noNewPriv,
+			Groups:               groups,
+			Umask:                sandboxContainer.User.Umask,
+			Capabilities:         capabilities,
+			SeccompProfileSHA256: sandboxContainer.SeccompProfileSHA256,
+			IsSandboxContainer:   true,
+		},
+	)
+	if err == nil {
+		t.Fatal("expected create_container to be denied when Privileged=true for a non-elevated sandbox container, but it was allowed")
+	}
+	assertDecisionJSONContains(t, err, "privileged escalation not allowed")
+}

From 649752a1111b5f4b8620b3874bf3cc62b0b72c0d Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Wed, 17 Jun 2026 11:52:14 +0000
Subject: [PATCH 3/4] framework.rego: Don't require
 candidate_containers[_].allow_elevated for sysfs rw special case

Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 pkg/securitypolicy/framework.rego | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego
index 5d90d275e8..76e5b048a0 100644
--- a/pkg/securitypolicy/framework.rego
+++ b/pkg/securitypolicy/framework.rego
@@ -796,8 +796,11 @@ mount_ok(mounts, allow_elevated, mount) {
 # the sandbox.  Since we have no other way to determine if the sandbox should be
 # allowed to be privileged or not (input.privileged is set to false for the
 # pause container even if the pod is privileged), we just special case the sysfs
-# mount.  Furthermore, we only allow this special case if this policy allows any
-# privileged containers at all.
+# mount.
+#
+# We have to allow this special case whether or not this policy currently allows
+# any privileged containers at all, since a fragment that is loaded in the
+# future may allow privileged containers.
 mount_ok(mounts, allow_elevated, mount) {
     input.isSandboxContainer
 
@@ -813,9 +816,6 @@ mount_ok(mounts, allow_elevated, mount) {
     "noexec" in mount.options
     "nodev" in mount.options
     "rw" in mount.options
-
-    some c in candidate_containers
-    c.allow_elevated
 }
 
 mountList_ok(mounts, allow_elevated) {

From 3f3657f199a40e7209ed3b8c33c990206969b75f Mon Sep 17 00:00:00 2001
From: Tingmao Wang <tingmaowang@microsoft.com>
Date: Wed, 17 Jun 2026 12:00:48 +0000
Subject: [PATCH 4/4] Add tests to check that sysfs rw special case is applied
 even when no container in the policy is elevated

Assisted-by: GitHub Copilot copilot-review
Signed-off-by: Tingmao Wang <tingmaowang@microsoft.com>
---
 pkg/securitypolicy/regopolicy_linux_test.go | 31 +++++++++++----------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go
index fbdd283de5..51da87a18c 100644
--- a/pkg/securitypolicy/regopolicy_linux_test.go
+++ b/pkg/securitypolicy/regopolicy_linux_test.go
@@ -7705,7 +7705,7 @@ func substituteUVMPath(sandboxID string, m mountInternal) mountInternal {
 //
 // DefaultCRIMounts() is used as the policy's default mount set so the /sys
 // "ro" mount is in data.defaultMounts.
-func setupSandboxSysfsTest(t *testing.T) (
+func setupSandboxSysfsTest(t *testing.T, workloadIsPrivileged bool) (
 	policy *regoEnforcer,
 	sandboxContainer *securityPolicyContainer,
 	containerID string,
@@ -7730,9 +7730,7 @@ func setupSandboxSysfsTest(t *testing.T) (
 	sandboxContainer.AllowElevated = false
 	sandboxContainer.Mounts = nil
 
-	// At least one other container in the policy must be elevated to enable
-	// the sandbox sysfs carve-out.
-	gc.containers[1].AllowElevated = true
+	gc.containers[1].AllowElevated = workloadIsPrivileged
 
 	defaultMounts := DefaultCRIMounts()
 	privilegedMounts := DefaultCRIPrivilegedMounts()
@@ -7769,15 +7767,20 @@ func sysfsMount(mode string) oci.Mount {
 
 func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
 	cases := []struct {
-		name               string
-		isSandboxContainer bool
-		mode               string
-		expectAllowed      bool
+		name                 string
+		isSandboxContainer   bool
+		mode                 string
+		expectAllowed        bool
+		workloadIsPrivileged bool
 	}{
-		{"sandbox_ro", true, "ro", true},
-		{"sandbox_rw", true, "rw", true},
-		{"non_sandbox_ro", false, "ro", true},
-		{"non_sandbox_rw", false, "rw", false},
+		{"sandbox_ro", true, "ro", true, true},
+		{"sandbox_rw", true, "rw", true, true},
+		// sysfs rw is allowed for sandbox containers even if no container in
+		// the policy is elevated (a future fragment might allow elevation).
+		{"sandbox_rw_no_elevated_workload", true, "rw", true, false},
+		{"non_sandbox_ro", false, "ro", true, false},
+		{"non_sandbox_rw", false, "rw", false, false},
+		{"non_sandbox_rw_elevated_workload", false, "rw", false, true},
 	}
 
 	for _, tc := range cases {
@@ -7786,7 +7789,7 @@ func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
 			// create_container records the container as "started" and a
 			// second call for the same containerID would be denied.
 			policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
-				setupSandboxSysfsTest(t)
+				setupSandboxSysfsTest(t, tc.workloadIsPrivileged)
 			noNewPriv := sandboxContainer.NoNewPrivileges
 			privileged := false
 			mounts := []oci.Mount{sysfsMount(tc.mode)}
@@ -7831,7 +7834,7 @@ func Test_Rego_SandboxSysfsCarveOut(t *testing.T) {
 // does not allow elevation, create_container must still be denied.
 func Test_Rego_SandboxSysfsCarveOut_PrivilegedRequestDenied(t *testing.T) {
 	policy, sandboxContainer, containerID, envList, user, groups, capabilities :=
-		setupSandboxSysfsTest(t)
+		setupSandboxSysfsTest(t, false)
 
 	noNewPriv := sandboxContainer.NoNewPrivileges
 	privileged := true