Introduction
Codex's cloud-based architecture enables a unique safety model: the agent can take bold actions in its sandbox, but changes only reach your code through pull requests. This creates a verification checkpoint that balances autonomy with control.
The Safety Philosophy: Let the agent experiment freely in isolation. Require human review before changes merge. This gives agents the freedom to try things while keeping humans in control of what ships.
Codex Safety Model
📝safety_model.txt
1┌────────────────────────────────────────────────────────────────┐
2│ CODEX SAFETY MODEL │
3├────────────────────────────────────────────────────────────────┤
4│ │
5│ ┌──────────────────────────────────────────────────────────┐ │
6│ │ SANDBOX BOUNDARY │ │
7│ │ ┌─────────────────────────────────────────────────────┐ │ │
8│ │ │ Agent Actions (PERMITTED) │ │ │
9│ │ │ ✓ Write any code │ │ │
10│ │ │ ✓ Install any packages │ │ │
11│ │ │ ✓ Run any commands │ │ │
12│ │ │ ✓ Access internet │ │ │
13│ │ │ ✓ Create/modify/delete files │ │ │
14│ │ └─────────────────────────────────────────────────────┘ │ │
15│ │ │ │ │
16│ │ ▼ │ │
17│ │ ┌─────────────────────────────────────────────────────┐ │ │
18│ │ │ Output (VERIFIED) │ │ │
19│ │ │ • Git diff of changes │ │ │
20│ │ │ • Test results │ │ │
21│ │ │ • Build status │ │ │
22│ │ │ • Execution logs │ │ │
23│ │ └─────────────────────────────────────────────────────┘ │ │
24│ └──────────────────────────────────────────────────────────┘ │
25│ │ │
26│ ▼ │
27│ ┌──────────────────────────────────────────────────────────┐ │
28│ │ HUMAN CHECKPOINT (Required) │ │
29│ │ • Review pull request │ │
30│ │ • Check diff for issues │ │
31│ │ • Verify tests pass │ │
32│ │ • Approve or request changes │ │
33│ └──────────────────────────────────────────────────────────┘ │
34│ │ │
35│ ▼ │
36│ ┌──────────────────────────────────────────────────────────┐ │
37│ │ PRODUCTION CODE │ │
38│ │ Changes merged only after human approval │ │
39│ └──────────────────────────────────────────────────────────┘ │
40│ │
41└────────────────────────────────────────────────────────────────┘
Safety Layers
| Layer | Protection | How It Works |
|---|---|---|
| Sandbox isolation | System protection | Container isolation, no host access |
| Resource limits | Runaway prevention | CPU, memory, time limits |
| Network policies | Exfiltration prevention | Restricted outbound, no inbound |
| PR workflow | Code quality | Human review required |
| CI/CD integration | Automation check | Tests must pass |
Verification Strategies
Automated Verification
🐍automated_verification.py
class CodexVerification:
    """Verification steps run before a pull request is created.

    Each ``run_*`` method returns a ``CheckResult`` with ``success`` and
    ``output``; ``verify_changes`` aggregates them into a
    ``VerificationResult``.
    """

    def verify_changes(self, workspace_path: str) -> VerificationResult:
        """Run every verification check against the workspace.

        Checks run in a fixed order (types, lint, tests, build, security),
        and all of them run even if an earlier one fails.

        Args:
            workspace_path: Directory handed to each individual check.

        Returns:
            A ``VerificationResult`` whose ``checks`` is the ordered list of
            ``(name, CheckResult)`` pairs and whose ``passed`` is true only
            when every check reported success.
        """
        # NOTE(review): run_linter and run_build are not defined in this
        # class as shown; presumably they are provided elsewhere -- confirm.
        runners = (
            ("types", self.run_type_check),
            ("lint", self.run_linter),
            ("tests", self.run_tests),
            ("build", self.run_build),
            ("security", self.run_security_scan),
        )
        results = [(name, run(workspace_path)) for name, run in runners]

        return VerificationResult(
            passed=all(check.success for _, check in results),
            checks=results,
        )

    @staticmethod
    def _run_command(command: list[str], path: str) -> tuple[bool, str]:
        """Run ``command`` in ``path``; return (exit code was 0, stdout + stderr)."""
        completed = subprocess.run(command, cwd=path, capture_output=True)
        # Tools disagree on which stream carries their report, so keep both.
        # errors="replace" keeps a stray non-UTF-8 byte from raising here.
        output = "\n".join(
            stream.decode(errors="replace")
            for stream in (completed.stdout, completed.stderr)
            if stream
        )
        return completed.returncode == 0, output

    def run_type_check(self, path: str) -> CheckResult:
        """Type-check the TypeScript project in ``path`` without emitting files."""
        # tsc prints its diagnostics on stdout; reading stderr alone would
        # leave a failed check with an empty explanation.
        passed, output = self._run_command(["npx", "tsc", "--noEmit"], path)
        return CheckResult(success=passed, output=output)

    def run_tests(self, path: str) -> CheckResult:
        """Run the test suite via ``npm test``; an empty suite is not a failure."""
        # Jest writes its report to stderr, so stdout alone can miss it.
        passed, output = self._run_command(
            ["npm", "test", "--", "--passWithNoTests"],
            path,
        )
        return CheckResult(success=passed, output=output)

    def run_security_scan(self, path: str) -> CheckResult:
        """Scan for committed secrets, then audit dependencies.

        The dependency audit is skipped when the secret scan already found
        something.
        """
        secrets = self.scan_for_secrets(path)
        if secrets:
            # NOTE(review): this message embeds whatever scan_for_secrets
            # returned; if that includes raw secret values they are copied
            # into the check output -- confirm it returns locations only.
            return CheckResult(
                success=False,
                output=f"Found potential secrets: {secrets}",
            )

        dep_scan = subprocess.run(
            ["npm", "audit", "--json"],
            cwd=path,
            capture_output=True,
        )

        # stdout only: it is the JSON report requested by --json, and mixing
        # stderr into it would make the output unparseable.
        return CheckResult(
            success=dep_scan.returncode == 0,
            output=dep_scan.stdout.decode(errors="replace"),
        )


# Self-Verification by Agent
🐍self_verification.py
class AgentSelfVerification:
    """Agent verifies its own work before submitting."""

    # Verdict lines the reviewing model is asked to finish its reply with.
    _PASS_VERDICT = "VERDICT: PASS"
    _FAIL_VERDICT = "VERDICT: FAIL"

    def verify_task_completion(
        self,
        task: str,
        changes: list[FileChange],
    ) -> bool:
        """Ask the model to review ``changes`` against ``task``.

        Args:
            task: The task description that was assigned.
            changes: The file changes to review; rendered into the prompt
                with ``self.format_changes``.

        Returns:
            True only when the reply carries an explicit pass verdict. A
            fail verdict, or a reply with no verdict line, returns False.
        """
        verification_prompt = f"""
Task that was assigned:
{task}

Changes made:
{self.format_changes(changes)}

Please verify:
1. Do these changes fully complete the task?
2. Are there any bugs or issues in the code?
3. Are all edge cases handled?
4. Are there any security concerns?
5. Is the code style consistent with the project?

If there are issues, describe them. Otherwise, confirm completion.

Finish with a final line that is exactly "{self._PASS_VERDICT}" or "{self._FAIL_VERDICT}".
"""

        response = self.llm.generate(verification_prompt)
        return self._parse_verdict(response)

    @classmethod
    def _parse_verdict(cls, response: str) -> bool:
        """Return True only if the last verdict line in ``response`` is a pass."""
        # Searching for the words "issue"/"bug" would reject a clean review
        # such as "No issues found", so only the verdict line is trusted.
        for line in reversed(response.splitlines()):
            # Tolerate markdown emphasis or code ticks around the verdict.
            marker = line.strip(" \t*_`").upper()
            if marker == cls._FAIL_VERDICT:
                return False
            if marker == cls._PASS_VERDICT:
                return True
        # No verdict at all: fail closed rather than assume success.
        return False

    def iterative_refinement(
        self,
        task: str,
        max_iterations: int = 3,
    ) -> WorkResult:
        """Implement ``task`` repeatedly until self-verification passes.

        Args:
            task: The task to implement; it is rewritten with
                ``self.refine_task_with_issues`` after each failed attempt.
            max_iterations: Upper bound on implementation attempts.

        Returns:
            A successful ``WorkResult`` with the accepted changes and the
            1-based attempt count, or a failed one with reason
            "Max iterations reached" carrying the last attempt's changes.
        """
        # Pre-bind so the failure result is well-formed even when the loop
        # body never runs (max_iterations <= 0).
        changes = []

        for iteration in range(max_iterations):
            changes = self.implement_changes(task)

            if self.verify_task_completion(task, changes):
                return WorkResult(
                    success=True,
                    changes=changes,
                    iterations=iteration + 1,
                )

            # Not verified: fold the problems into the task and retry.
            task = self.refine_task_with_issues(task, changes)

        return WorkResult(
            success=False,
            changes=changes,
            iterations=max_iterations,
            reason="Max iterations reached",
        )


# The PR Workflow
Pull requests are the gateway between agent work and production code:
🐍pr_workflow.py
class CodexPRWorkflow:
    """Manage the PR creation and review workflow."""

    # (key in VerificationResult.checks, label shown in the PR table)
    _CHECK_LABELS = (
        ("types", "Types"),
        ("lint", "Lint"),
        ("tests", "Tests"),
        ("build", "Build"),
        ("security", "Security"),
    )

    def create_pr(
        self,
        task: str,
        changes: list[FileChange],
        verification: VerificationResult,
        base: str = "main",
    ) -> PullRequest:
        """Commit the agent's work to a new branch and open a pull request.

        Args:
            task: Task description; used for branch name, commit message,
                PR title and PR body.
            changes: The file changes being submitted.
            verification: Verification outcome rendered into the PR body.
            base: Branch the pull request targets.

        Returns:
            The pull request object returned by ``self.github``.
        """
        # Create branch
        branch_name = self.generate_branch_name(task)
        self.git.checkout_new_branch(branch_name)

        # Commit changes
        self.git.add_all()
        commit_message = self.generate_commit_message(task, changes)
        self.git.commit(commit_message)

        # Push
        self.git.push(branch_name)

        # Create PR
        pr_body = self.generate_pr_body(
            task=task,
            changes=changes,
            verification=verification,
        )

        return self.github.create_pull_request(
            title=self.generate_pr_title(task),
            body=pr_body,
            head=branch_name,
            base=base,
        )

    @staticmethod
    def _format_status(check) -> str:
        """Render one check as a table cell; ``None`` means it never ran."""
        if check is None:
            return "⚠️ Not run"
        return "✅ Pass" if check.success else "❌ Fail"

    def generate_pr_body(
        self,
        task: str,
        changes: list[FileChange],
        verification: VerificationResult,
    ) -> str:
        """Generate the markdown PR description.

        ``verification.checks`` may be a mapping of name to result or an
        iterable of ``(name, result)`` pairs; both are accepted.
        """
        # dict() normalizes both shapes; indexing a list of pairs by name
        # would raise TypeError.
        checks = dict(verification.checks)
        rows = "\n".join(
            f"| {label} | {self._format_status(checks.get(key))} |"
            for key, label in self._CHECK_LABELS
        )

        return f"""
## Summary

{self.summarize_changes(task, changes)}

## Changes

{self.list_changes(changes)}

## Verification Results

| Check | Status |
|-------|--------|
{rows}

## Agent Notes

{self.agent_notes}

## Review Checklist

- [ ] Code changes look correct
- [ ] Tests are comprehensive
- [ ] No security concerns
- [ ] Documentation updated if needed

---
*Generated by Codex*
"""

    def handle_review_feedback(
        self,
        pr: PullRequest,
        feedback: list[ReviewComment],
    ) -> None:
        """Apply requested changes, push them, and reply to those comments.

        Only comments with ``requires_changes`` set are acted on. Nothing is
        pushed and no reply is posted when no comment needed a change.
        """
        addressed = []
        for comment in feedback:
            if not comment.requires_changes:
                continue

            self.address_comment(comment)

            # Stage before committing, as create_pr does; otherwise the
            # commit would not contain the fix.
            self.git.add_all()
            self.git.commit(f"Address review: {comment.summary}")
            addressed.append(comment)

        if not addressed:
            return

        # Push updates
        self.git.push(pr.branch)

        # Reply only where a commit actually addressed the comment.
        for comment in addressed:
            self.github.reply_to_comment(
                comment.id,
                "Addressed in latest commit.",
            )


# PR Content Best Practices
- Clear summary: What was done and why
- Change list: Files modified with brief descriptions
- Test results: Verification status for each check
- Agent notes: Any considerations or alternatives explored
- Review focus: Areas that need careful review
Implementing Similar Safeguards
Pattern 1: Staged Execution
🐍staged_execution.py
class StagedExecution:
    """Execute changes in stages with checkpoints."""

    def execute_with_staging(self, task: str) -> ExecutionResult:
        """Run ``task`` through plan, sandboxed implementation, and PR stages.

        Returns:
            An ``ExecutionResult``. It is unsuccessful with stage "planning"
            when the plan is not approved, unsuccessful with stage
            "verification" when the implementation fails verification, and
            otherwise successful with stage "complete" and the created PR.
        """
        # Stage 1: Plan (no side effects).
        proposal = self.create_plan(task)

        # Checkpoint: Review plan
        if not self.approve_plan(proposal):
            return ExecutionResult(
                success=False,
                stage="planning",
                reason="Plan not approved",
            )

        # Stage 2: Implement in sandbox.
        with self.sandbox() as box:
            built = self.implement(proposal, box)

            # Checkpoint: Verify implementation
            report = self.verify(built, box)
            if not report.passed:
                return ExecutionResult(
                    success=False,
                    stage="verification",
                    reason=report.failures,
                )

        # Stage 3: Create PR (human review).
        # NOTE(review): the listing lost its indentation; the PR step is
        # placed after the sandbox block to match the stage markers -- confirm.
        pr = self.create_pr(built)
        return ExecutionResult(
            success=True,
            stage="complete",
            pr=pr,
        )


# Pattern 2: Approval Levels
🐍approval_levels.py
1from enum import Enum
2
class RiskLevel(Enum):
    """Risk tiers for a set of changes, from least to most sensitive."""

    LOW = "low"            # Add comment, fix typo
    MEDIUM = "medium"      # New function, simple bug fix
    HIGH = "high"          # New feature, refactor
    CRITICAL = "critical"  # Security, database, auth


class ApprovalPolicy:
    """Define approval requirements based on risk."""

    POLICIES = {
        RiskLevel.LOW: {
            "auto_merge": True,
            "required_reviewers": 0,
            "tests_required": True,
        },
        RiskLevel.MEDIUM: {
            "auto_merge": False,
            "required_reviewers": 1,
            "tests_required": True,
        },
        RiskLevel.HIGH: {
            "auto_merge": False,
            "required_reviewers": 2,
            "tests_required": True,
            "security_review": True,
        },
        RiskLevel.CRITICAL: {
            "auto_merge": False,
            "required_reviewers": 2,
            "tests_required": True,
            "security_review": True,
            "manual_verification": True,
        },
    }

    # Lower-cased path fragments; a substring match anywhere in a changed
    # file's path is enough to trigger the tier.
    _CRITICAL_PATTERNS = (
        "auth", "security", "password", "token",
        "database", "migration", "env", "secret",
    )
    _HIGH_PATTERNS = (
        "payment", "user", "permission", "role",
        "api", "external", "integration",
    )

    def assess_risk(self, changes: list[FileChange]) -> RiskLevel:
        """Assess the risk level of a set of changes.

        Path patterns take precedence over size: any critical path makes the
        whole set CRITICAL, otherwise any high-risk path makes it HIGH.
        Without a path match, more than 500 changed lines is HIGH, more than
        100 is MEDIUM, and anything else (including no changes) is LOW.

        Args:
            changes: Objects exposing ``path`` and ``lines_changed``.
        """
        paths = [change.path.lower() for change in changes]

        # Look at every path before settling on a tier: returning on the
        # first matching file would let a HIGH-pattern file listed early
        # hide a CRITICAL-pattern file listed later.
        if self._any_path_matches(paths, self._CRITICAL_PATTERNS):
            return RiskLevel.CRITICAL
        if self._any_path_matches(paths, self._HIGH_PATTERNS):
            return RiskLevel.HIGH

        # Check change size
        total_lines = sum(change.lines_changed for change in changes)
        if total_lines > 500:
            return RiskLevel.HIGH
        if total_lines > 100:
            return RiskLevel.MEDIUM

        return RiskLevel.LOW

    @staticmethod
    def _any_path_matches(paths, patterns) -> bool:
        """Return True if any pattern is a substring of any path."""
        return any(pattern in path for path in paths for pattern in patterns)

    def get_requirements(self, risk: RiskLevel) -> dict:
        """Return the approval requirements for ``risk``.

        The result is a copy, so callers cannot alter the shared policy
        table by editing it.
        """
        return dict(self.POLICIES[risk])


# Pattern 3: Rollback Capability
🐍rollback_capability.py
class RollbackManager:
    """Manage rollback capability for agent changes."""

    def __init__(self, git=None):
        """Start with no checkpoints.

        Args:
            git: Optional git client exposing ``create_tag`` and
                ``reset_hard``. When omitted, ``self.git`` is left untouched.
        """
        self.checkpoints = []
        # NOTE(review): the original never assigned self.git; presumably a
        # subclass or mixin supplies it, so it is only set when given.
        if git is not None:
            self.git = git

    def create_checkpoint(self, label: str) -> str:
        """Tag the current state and remember the tag as the newest checkpoint.

        Returns:
            Whatever ``self.git.create_tag`` returns for the new tag.
        """
        checkpoint_id = self.git.create_tag(
            f"checkpoint-{label}-{time.time()}"
        )
        self.checkpoints.append(checkpoint_id)
        return checkpoint_id

    def rollback_to(self, checkpoint_id: str) -> None:
        """Hard-reset to a specific checkpoint."""
        self.git.reset_hard(checkpoint_id)

    def rollback_last(self) -> None:
        """Roll back to the most recent checkpoint; do nothing if there is none."""
        if self.checkpoints:
            self.rollback_to(self.checkpoints[-1])

    def execute_with_rollback(
        self,
        action: Callable,
        *args,
        **kwargs,
    ) -> Any:
        """Execute ``action`` and roll back if it raises or fails verification.

        Args:
            action: Callable to run; ``*args`` and ``**kwargs`` are passed to it.

        Returns:
            The value returned by ``action``.

        Raises:
            VerificationError: If ``self.verify_result`` rejects the result.
            Exception: Anything raised by ``action`` is re-raised after the
                rollback.
        """
        checkpoint = self.create_checkpoint("pre-action")

        try:
            result = action(*args, **kwargs)

            # The handler below performs the single rollback for this
            # failure too; rolling back here as well would reset twice.
            if not self.verify_result(result):
                raise VerificationError("Result verification failed")
        except Exception:
            self.rollback_to(checkpoint)
            raise

        return result


# Always Have Rollback
Agent actions can have unintended consequences. Always maintain the ability to roll back quickly. Test your rollback procedures regularly.
Summary
Safety and verification in Codex:
- Sandbox isolation: Agent can't affect host or other tasks
- Automated verification: Types, tests, lint, build, security
- PR workflow: Human review before merge
- Risk assessment: Match review requirements to change risk
- Rollback capability: Always be able to undo
Chapter Complete: You now understand how OpenAI Codex works - from sandboxed execution to dynamic reasoning to safety controls. Next, we'll explore Google's Gemini agents and their multimodal architecture.