Introduction
Sandboxed execution is what makes Codex safe for autonomous operation. By running each task in an isolated container, Codex can take bold actions - installing packages, running arbitrary code, accessing the internet - without risk to user systems.
The Sandbox Philosophy: Give the agent full power within a contained environment. It can't break anything that matters, so let it experiment freely. This unlocks capabilities that would be too dangerous on a local machine.
The Sandbox Model
Each Codex task runs in its own isolated environment:
📝sandbox_model.txt
1┌─────────────────────────────────────────────────────────────┐
2│ HOST SYSTEM │
3│ ┌───────────────────────────────────────────────────────┐ │
4│ │ SANDBOX (Container) │ │
5│ │ ┌─────────────────────────────────────────────────┐ │ │
6│ │ │ Isolated Filesystem │ │ │
7│ │ │ - /workspace (cloned repo) │ │ │
8│ │ │ - /home/agent (agent state) │ │ │
9│ │ │ - Read-only system directories │ │ │
10│ │ └─────────────────────────────────────────────────┘ │ │
11│ │ │ │
12│ │ ┌─────────────────────────────────────────────────┐ │ │
13│ │ │ Isolated Network │ │ │
14│ │ │ - Outbound: allowed (with monitoring) │ │ │
15│ │ │ - Inbound: blocked │ │ │
16│ │ │ - No access to internal networks │ │ │
17│ │ └─────────────────────────────────────────────────┘ │ │
18│ │ │ │
19│ │ ┌─────────────────────────────────────────────────┐ │ │
20│ │ │ Resource Limits │ │ │
21│ │ │ - CPU: 4 cores │ │ │
22│ │ │ - Memory: 16GB │ │ │
23│ │ │ - Disk: 50GB │ │ │
24│ │ │ - Time: 4 hours max │ │ │
25│ │ └─────────────────────────────────────────────────┘ │ │
26│ └───────────────────────────────────────────────────────┘ │
27└─────────────────────────────────────────────────────────────┘
Isolation Guarantees
| Aspect | Guarantee | Enforcement |
|---|---|---|
| Filesystem | Cannot access host files | Container namespaces |
| Network | Cannot reach internal services | Network policies |
| Resources | Cannot exhaust host resources | cgroups limits |
| Other tasks | Cannot see other sandboxes | Container isolation |
| Time | Cannot run forever | Timeout enforcement |
Container Technology
Codex sandboxes are likely built on container technology similar to Docker or Firecracker:
🐍sandbox_container.py
1import docker
2from dataclasses import dataclass
3
@dataclass
class SandboxConfig:
    """Resource and network settings for a single sandbox container.

    The size fields are strings with a unit suffix (for example ``"16g"``);
    ``timeout`` is a wall-clock budget in seconds.
    """

    cpu_limit: float = 4.0  # CPU cores
    memory_limit: str = "16g"  # Memory
    disk_limit: str = "50g"  # Disk space
    timeout: int = 14400  # 4 hours in seconds
    network_mode: str = "bridge"  # Network isolation
11
12
class SandboxManager:
    """Manages sandbox containers for code execution."""

    def __init__(self):
        # Client is configured from the environment (docker.from_env()).
        self.client = docker.from_env()

    def create_sandbox(
        self,
        task_id: str,
        config: "SandboxConfig",
    ) -> str:
        """Create an isolated sandbox for a task.

        Starts a detached ``codex-sandbox:latest`` container named
        ``sandbox-<task_id>`` with the CPU, memory and network settings from
        *config*, and bind-mounts ``/data/workspaces/<task_id>`` read-write
        at ``/workspace``.

        NOTE(review): ``config.disk_limit`` and ``config.timeout`` are not
        applied here; they must be enforced elsewhere.

        Returns:
            The ID of the started container.
        """
        cpu_period = 100000
        container = self.client.containers.run(
            image="codex-sandbox:latest",
            name=f"sandbox-{task_id}",
            detach=True,
            cpu_period=cpu_period,
            # quota / period == number of cores the container may use
            cpu_quota=int(config.cpu_limit * cpu_period),
            mem_limit=config.memory_limit,
            network_mode=config.network_mode,
            security_opt=["no-new-privileges"],
            cap_drop=["ALL"],  # Drop all capabilities
            cap_add=["NET_BIND_SERVICE"],  # Minimal caps
            read_only=False,
            tmpfs={"/tmp": "size=1G"},
            volumes={
                f"/data/workspaces/{task_id}": {
                    "bind": "/workspace",
                    "mode": "rw",
                }
            },
        )

        return container.id

    @staticmethod
    def _with_timeout(command: str, timeout: int) -> list[str]:
        """Return the argv for *command* prefixed with ``timeout <seconds>``.

        The command string is split with ``shlex.split`` so it is still
        executed directly rather than through a shell.
        """
        import shlex

        return ["timeout", str(timeout), *shlex.split(command)]

    def execute_in_sandbox(
        self,
        container_id: str,
        command: str,
        timeout: int = 300,
    ) -> tuple[int, str, str]:
        """Execute a command in a sandbox.

        Args:
            container_id: ID of a container created by ``create_sandbox``.
            command: Command line to run inside the container.
            timeout: Maximum run time in seconds. Enforced by running the
                command under the ``timeout`` utility.
                NOTE(review): assumes the sandbox image provides a
                ``timeout`` binary (coreutils or busybox); with GNU
                coreutils an expired command exits with status 124.

        Returns:
            ``(exit_code, stdout, stderr)`` with both streams decoded to
            text; a stream that produced no output is returned as ``""``.
        """
        container = self.client.containers.get(container_id)

        result = container.exec_run(
            self._with_timeout(command, timeout),
            stream=False,
            demux=True,  # keep stdout and stderr separate
        )

        raw_stdout, raw_stderr = result.output
        # Arbitrary programs may emit non-UTF-8 bytes; never fail on decode.
        stdout = raw_stdout.decode(errors="replace") if raw_stdout else ""
        stderr = raw_stderr.decode(errors="replace") if raw_stderr else ""

        return result.exit_code, stdout, stderr

    def destroy_sandbox(self, container_id: str) -> None:
        """Clean up a sandbox.

        Tries a graceful stop (10 s grace period) and then always force
        removes the container, even if the stop call raised.
        """
        container = self.client.containers.get(container_id)
        try:
            container.stop(timeout=10)
        finally:
            container.remove(force=True)


# Firecracker for Production
For production systems, consider Firecracker microVMs instead of Docker. They provide stronger isolation with VM-level boundaries while maintaining container-like performance.
Networking and Internet Access
Codex sandboxes have outbound internet access - a key differentiator from local agents:
🐍network_config.py
class NetworkPolicy:
    """Network policies for sandbox containers.

    NOTE(review): ``allow_outbound``, ``block_outbound``,
    ``block_all_inbound`` and ``apply_rate_limits`` are called below but are
    not defined in this listing; they must be supplied elsewhere (for
    example by a subclass).
    """

    def configure_network(self, container_id: str) -> None:
        """Apply the outbound allow-list, block-list, inbound block and rate limits.

        Args:
            container_id: Sandbox the policy is meant for.
                NOTE(review): not forwarded to the helper calls — confirm
                how the helpers learn which container to configure.
        """
        # Allow outbound connections
        self.allow_outbound([
            # Package registries
            "registry.npmjs.org",
            "pypi.org",
            "files.pythonhosted.org",  # PyPI serves package files from here
            "rubygems.org",

            # Documentation
            "docs.python.org",
            "developer.mozilla.org",
            "github.com",  # apex domain is not covered by the wildcard
            "*.github.com",

            # Search
            "*.google.com",
            "*.bing.com",

            # APIs (monitored)
            "api.openai.com",
        ])

        # Block certain destinations
        self.block_outbound([
            # Internal networks
            "10.0.0.0/8",
            "172.16.0.0/12",
            "192.168.0.0/16",
            "169.254.0.0/16",  # link-local, incl. cloud metadata endpoints

            # Known dangerous services
            "*.onion",
        ])

        # Block all inbound
        self.block_all_inbound()

        # Rate limit requests
        self.apply_rate_limits(
            requests_per_minute=100,
            bandwidth_mbps=50,
        )


# What Internet Access Enables
- Documentation lookup: Read official docs during implementation
- Package installation: Install dependencies from registries
- API access: Call external APIs for integration tasks
- Web research: Search for solutions to problems
Resource Management
🐍resource_management.py
1from dataclasses import dataclass
2import time
3
@dataclass
class ResourceUsage:
    """A point-in-time sample of what a sandbox is consuming."""

    cpu_percent: float
    memory_mb: int  # compared against the memory limit in MB
    disk_mb: int  # compared against the disk limit in MB
    elapsed_seconds: int  # compared against the task timeout
10
11
class ResourceMonitor:
    """Monitor and enforce resource limits.

    NOTE(review): ``kill_sandbox`` and ``notify_user`` are called by
    ``handle_violations`` but are not defined in this listing; they must be
    supplied elsewhere (for example by a subclass).
    """

    # Bytes per unit suffix accepted in size strings such as "16g".
    _UNIT_BYTES = {
        "": 1,
        "b": 1,
        "k": 1024,
        "m": 1024 ** 2,
        "g": 1024 ** 3,
        "t": 1024 ** 4,
    }

    def __init__(self, limits: "SandboxConfig"):
        self.limits = limits
        self.start_time = time.time()

    @classmethod
    def _parse_size_mb(cls, size: str) -> int:
        """Convert a size string (``"16g"``, ``"512m"``, ``"50GB"``) to whole MB.

        A bare number is interpreted as bytes. Fractions of a megabyte are
        truncated.

        Raises:
            ValueError: If *size* is not a number followed by a known unit.
        """
        text = str(size).strip().lower()
        # Accept "gb"/"mb"/... as aliases of "g"/"m"/...
        if len(text) > 1 and text.endswith("b") and not text[-2].isdigit():
            text = text[:-1]

        number = text.rstrip("bkmgt")
        unit = text[len(number):]
        if unit not in cls._UNIT_BYTES:
            raise ValueError(f"Invalid size: {size!r}")
        try:
            value = float(number)
        except ValueError:
            raise ValueError(f"Invalid size: {size!r}") from None

        return int(value * cls._UNIT_BYTES[unit] // (1024 ** 2))

    def _parse_memory(self, memory_limit: str) -> int:
        """Return the memory limit in MB."""
        return self._parse_size_mb(memory_limit)

    def _parse_disk(self, disk_limit: str) -> int:
        """Return the disk limit in MB."""
        return self._parse_size_mb(disk_limit)

    def check_limits(self, usage: "ResourceUsage") -> bool:
        """Check if resource usage is within limits.

        Compares elapsed time, memory and disk against ``self.limits``. If
        any limit is exceeded, all violations are passed to
        ``handle_violations``.

        Returns:
            ``True`` when every checked value is within its limit, ``False``
            otherwise.
        """
        violations = []

        # Check time limit
        if usage.elapsed_seconds > self.limits.timeout:
            violations.append(f"Time limit exceeded: {usage.elapsed_seconds}s")

        # Check memory
        memory_limit_mb = self._parse_memory(self.limits.memory_limit)
        if usage.memory_mb > memory_limit_mb:
            violations.append(f"Memory exceeded: {usage.memory_mb}MB")

        # Check disk
        disk_limit_mb = self._parse_disk(self.limits.disk_limit)
        if usage.disk_mb > disk_limit_mb:
            violations.append(f"Disk exceeded: {usage.disk_mb}MB")

        if violations:
            self.handle_violations(violations)
            return False

        return True

    def handle_violations(self, violations: list[str]) -> None:
        """Handle resource limit violations: log, kill the task, notify."""
        # Log violations
        for v in violations:
            print(f"Resource violation: {v}")

        # Kill the task
        self.kill_sandbox()

        # Notify user
        self.notify_user(violations)


# Resource Limits Protect Everyone
Resource limits prevent runaway tasks from affecting other users. In multi-tenant systems, this is essential for fair scheduling and cost control.
Implementing Similar Sandboxes
You can implement similar sandboxing for your own agents:
Using Docker
🐍docker_sandbox.py
1import docker
2import tempfile
3import shutil
4from pathlib import Path
5
class DockerSandbox:
    """Simple Docker-based sandbox for agent execution."""

    def __init__(self):
        self.client = docker.from_env()

    def run_task(self, task: str, repo_url: str) -> str:
        """Run an agent task in a sandbox.

        Clones *repo_url* into a temporary workspace, runs the agent script
        in a ``python:3.11`` container with that workspace mounted at
        ``/workspace``, waits up to an hour, and returns the container logs.
        The container and the workspace are removed on every exit path.

        Raises:
            subprocess.CalledProcessError: If ``git clone`` fails.
        """
        # Local import: this listing's import block does not include it.
        import subprocess

        # Create workspace
        workspace = tempfile.mkdtemp()

        try:
            # Clone repo; "--" stops a URL starting with "-" from being
            # parsed as a git option, check=True surfaces clone failures.
            subprocess.run(
                ["git", "clone", "--", repo_url, workspace],
                check=True,
            )

            # Create container. The script is passed as its own argv element
            # so quotes and newlines in it cannot break the command line.
            container = self.client.containers.run(
                "python:3.11",
                command=["python", "-c", self._agent_script(task)],
                detach=True,
                volumes={workspace: {"bind": "/workspace", "mode": "rw"}},
                working_dir="/workspace",
                mem_limit="4g",
                cpu_period=100000,
                cpu_quota=200000,  # 2 CPU cores
            )

            try:
                # Wait for completion
                container.wait(timeout=3600)

                # Get output
                return container.logs().decode(errors="replace")
            finally:
                # Cleanup even when wait() times out or logs() fails;
                # force=True also removes a container that is still running.
                container.remove(force=True)

        finally:
            shutil.rmtree(workspace)

    def _agent_script(self, task: str) -> str:
        """Return the Python source executed inside the container.

        The task text is embedded as a string literal via ``repr`` so any
        quoting in it stays valid Python.
        """
        return f'''
import anthropic
client = anthropic.Anthropic()
task = {task!r}
# Agent implementation here
'''


# Using E2B
🐍e2b_sandbox.py
1from e2b import Sandbox
2
class E2BSandbox:
    """Use E2B for managed sandboxing."""

    @staticmethod
    def _shell_commands(task: str, repo_url: str) -> tuple[str, str, str]:
        """Build the clone, install and agent command lines.

        ``repo_url`` and ``task`` are shell-quoted because the commands are
        shell strings (they use ``cd ... &&``); unquoted interpolation would
        let either value inject extra shell commands.
        """
        import shlex

        return (
            f"git clone -- {shlex.quote(repo_url)} /workspace",
            "cd /workspace && npm install",
            f"cd /workspace && python agent.py {shlex.quote(task)}",
        )

    def run_task(self, task: str, repo_url: str) -> str:
        """Clone the repo, install dependencies and run the agent in a sandbox.

        Returns:
            The standard output of the agent script.
        """
        clone_cmd, install_cmd, agent_cmd = self._shell_commands(task, repo_url)

        with Sandbox() as sandbox:
            # Clone repo
            sandbox.process.run(clone_cmd)

            # Install dependencies
            sandbox.process.run(install_cmd)

            # Run agent script
            result = sandbox.process.run(agent_cmd)

            return result.stdout


# Summary
Sandboxed cloud execution provides:
- Isolation: Tasks can't affect host or other tasks
- Internet access: Full documentation and API access
- Resource limits: CPU, memory, disk, time constraints
- Safety: Agent can experiment freely without risk
- Reproducibility: Clean environment for each task
Next: Let's explore how Codex uses the o3 reasoning model for dynamic, extended thinking during complex tasks.