Introduction
Debugging AI agents is uniquely challenging because their behavior is non-deterministic, context-dependent, and emerges from complex interactions between prompts, tools, and memory. This section covers techniques for understanding and fixing agent issues.
Section Overview: We'll explore common agent issues, replay debugging, interactive debugging techniques, and specialized debugging tools.
Common Agent Issues
Issue Categories
| Issue Type | Symptoms | Common Causes |
|---|---|---|
| Loops | Same action repeated | Unclear goal, poor prompting |
| Goal drift | Gradual topic shift | Long context, weak constraints |
| Hallucination | False information | Knowledge gaps, overconfidence |
| Tool misuse | Wrong tool selection | Unclear tool descriptions |
| Memory failures | Forgets context | Context window limits |
| Stuck states | No progress | Ambiguous state, missing info |
🐍python
1"""
2Common Agent Issues and Detection
3
4Automated detection of common agent problems.
5"""
6
7from dataclasses import dataclass
8from datetime import datetime, timedelta
9from typing import Any
10
11
@dataclass
class DebugIssue:
    """A single problem detected in an agent run.

    Produced by the diagnostic checks in AgentDebugger; carries enough
    context (evidence plus a suggested fix) for a human to act on.
    """
    issue_type: str       # machine-readable category, e.g. "exact_loop", "goal_drift"
    severity: str         # low, medium, high, critical
    description: str      # human-readable summary of what was detected
    evidence: list[dict]  # raw data supporting the diagnosis
    suggested_fix: str    # actionable remediation hint
21
22class AgentDebugger:
23 """Detect and diagnose common agent issues."""
24
25 def __init__(self):
26 self.action_history: list[dict] = []
27 self.thought_history: list[str] = []
28 self.goal_history: list[str] = []
29
30 def record_action(self, action: dict):
31 """Record an agent action."""
32 self.action_history.append({
33 **action,
34 "timestamp": datetime.now()
35 })
36
37 def record_thought(self, thought: str):
38 """Record an agent thought."""
39 self.thought_history.append(thought)
40
41 def record_goal(self, goal: str):
42 """Record goal state."""
43 self.goal_history.append(goal)
44
45 def diagnose(self) -> list[DebugIssue]:
46 """Run all diagnostic checks."""
47 issues = []
48
49 issues.extend(self._check_loops())
50 issues.extend(self._check_goal_drift())
51 issues.extend(self._check_stuck_state())
52 issues.extend(self._check_tool_patterns())
53
54 return sorted(issues, key=lambda i: self._severity_order(i.severity))
55
56 def _check_loops(self) -> list[DebugIssue]:
57 """Detect action loops."""
58 issues = []
59
60 if len(self.action_history) < 3:
61 return issues
62
63 # Check for exact repeats
64 recent = self.action_history[-5:]
65 action_keys = [
66 f"{a['type']}:{str(a.get('input', ''))[:50]}"
67 for a in recent
68 ]
69
70 if len(set(action_keys)) == 1:
71 issues.append(DebugIssue(
72 issue_type="exact_loop",
73 severity="high",
74 description="Agent is repeating the exact same action",
75 evidence=[{"actions": action_keys}],
76 suggested_fix="Add loop detection in prompt or implement action deduplication"
77 ))
78
79 # Check for oscillation (A, B, A, B pattern)
80 if len(action_keys) >= 4:
81 if action_keys[-1] == action_keys[-3] and action_keys[-2] == action_keys[-4]:
82 issues.append(DebugIssue(
83 issue_type="oscillation",
84 severity="medium",
85 description="Agent is oscillating between two actions",
86 evidence=[{"pattern": action_keys[-4:]}],
87 suggested_fix="Add state memory to track attempted approaches"
88 ))
89
90 return issues
91
92 def _check_goal_drift(self) -> list[DebugIssue]:
93 """Detect drift from original goal."""
94 issues = []
95
96 if len(self.goal_history) < 2:
97 return issues
98
99 original = self.goal_history[0]
100 current = self.goal_history[-1]
101
102 # Simple similarity check (use embeddings in production)
103 original_words = set(original.lower().split())
104 current_words = set(current.lower().split())
105
106 overlap = len(original_words & current_words)
107 total = len(original_words | current_words)
108 similarity = overlap / total if total > 0 else 0
109
110 if similarity < 0.3:
111 issues.append(DebugIssue(
112 issue_type="goal_drift",
113 severity="high",
114 description=f"Current goal has drifted significantly from original ({similarity:.0%} overlap)",
115 evidence=[
116 {"original_goal": original},
117 {"current_goal": current}
118 ],
119 suggested_fix="Add goal anchoring to prompts or periodic goal re-verification"
120 ))
121
122 return issues
123
124 def _check_stuck_state(self) -> list[DebugIssue]:
125 """Detect if agent is stuck."""
126 issues = []
127
128 if len(self.action_history) < 5:
129 return issues
130
131 # Check if no progress in last N actions
132 recent_results = [
133 a.get("result", {}).get("success", True)
134 for a in self.action_history[-5:]
135 ]
136
137 if all(not r for r in recent_results):
138 issues.append(DebugIssue(
139 issue_type="stuck_state",
140 severity="high",
141 description="Last 5 actions all failed - agent may be stuck",
142 evidence=[
143 {"failed_actions": [a["type"] for a in self.action_history[-5:]]}
144 ],
145 suggested_fix="Implement failure recovery strategy or escalation"
146 ))
147
148 return issues
149
150 def _check_tool_patterns(self) -> list[DebugIssue]:
151 """Detect problematic tool usage patterns."""
152 issues = []
153
154 if len(self.action_history) < 3:
155 return issues
156
157 # Check for tool with consistently bad results
158 tool_results: dict[str, list[bool]] = {}
159 for action in self.action_history:
160 tool = action.get("type", "unknown")
161 success = action.get("result", {}).get("success", True)
162 if tool not in tool_results:
163 tool_results[tool] = []
164 tool_results[tool].append(success)
165
166 for tool, results in tool_results.items():
167 if len(results) >= 3 and not any(results[-3:]):
168 issues.append(DebugIssue(
169 issue_type="tool_failure",
170 severity="medium",
171 description=f"Tool '{tool}' has failed last 3 attempts",
172 evidence=[{"tool": tool, "failure_count": len(results)}],
173 suggested_fix="Check tool configuration or add fallback tool"
174 ))
175
176 return issues
177
    def _severity_order(self, severity: str) -> int:
        # Sort rank for severities: lower sorts first, unknown values last.
        return {"critical": 0, "high": 1, "medium": 2, "low": 3}.get(severity, 4)

Replay Debugging
Reproducing Agent Behavior
🐍python
1"""
2Replay Debugging
3
4Record and replay agent executions for debugging.
5"""
6
7from dataclasses import dataclass, field
8from datetime import datetime
9from typing import Any
10import json
11
12
@dataclass
class AgentSnapshot:
    """Snapshot of agent state at a point in time."""
    timestamp: datetime          # wall-clock time the snapshot was taken
    iteration: int               # 1-based counter assigned by AgentRecorder.snapshot()
    messages: list[dict]         # conversation history at this point
    memory_state: dict           # agent memory contents
    pending_actions: list[dict]  # actions queued but not yet executed
    context: dict                # additional execution context
22
23
@dataclass
class ExecutionRecording:
    """Complete recording of an agent execution."""
    recording_id: str  # unique identifier for this run
    task: str          # task the agent was asked to perform
    config: dict       # agent configuration used for the run
    snapshots: list[AgentSnapshot] = field(default_factory=list)  # per-iteration state
    llm_calls: list[dict] = field(default_factory=list)   # raw LLM request/response pairs
    tool_calls: list[dict] = field(default_factory=list)  # tool invocations with results
    final_result: dict | None = None  # set by AgentRecorder.end_recording()
34
35
class AgentRecorder:
    """Record agent execution for replay.

    Collects per-iteration state snapshots plus every LLM and tool call so
    a run can be saved to disk and stepped through with AgentReplayer.
    """

    def __init__(self, recording_id: str):
        self.recording = ExecutionRecording(
            recording_id=recording_id,
            task="",
            config={}
        )
        self.iteration = 0  # bumped by snapshot(); tags subsequent LLM/tool calls

    def start_recording(self, task: str, config: dict):
        """Start a new recording."""
        self.recording.task = task
        self.recording.config = config

    def snapshot(
        self,
        messages: list[dict],
        memory_state: dict,
        pending_actions: list[dict],
        context: dict
    ):
        """Take a snapshot of current state."""
        from copy import deepcopy  # stdlib; local import keeps the block self-contained

        self.iteration += 1
        self.recording.snapshots.append(AgentSnapshot(
            timestamp=datetime.now(),
            iteration=self.iteration,
            # BUG FIX: deep-copy the state. The previous shallow .copy() shared
            # nested dicts/lists with the live agent, so later mutations
            # silently rewrote already-taken snapshots.
            messages=deepcopy(messages),
            memory_state=deepcopy(memory_state),
            pending_actions=deepcopy(pending_actions),
            context=deepcopy(context)
        ))

    def record_llm_call(
        self,
        request: dict,
        response: dict,
        duration_ms: float
    ):
        """Record an LLM API call, tagged with the current iteration."""
        self.recording.llm_calls.append({
            "iteration": self.iteration,
            "timestamp": datetime.now().isoformat(),
            "request": request,
            "response": response,
            "duration_ms": duration_ms
        })

    def record_tool_call(
        self,
        tool_name: str,
        input_data: dict,
        output_data: dict,
        success: bool
    ):
        """Record a tool call, tagged with the current iteration."""
        self.recording.tool_calls.append({
            "iteration": self.iteration,
            "timestamp": datetime.now().isoformat(),
            "tool": tool_name,
            "input": input_data,
            "output": output_data,
            "success": success
        })

    def end_recording(self, result: dict):
        """End the recording."""
        self.recording.final_result = result

    def save(self, path: str):
        """Save recording to file as JSON.

        NOTE(review): assumes snapshot contents are JSON-serializable;
        datetime values other than AgentSnapshot.timestamp would raise.
        """
        data = {
            "recording_id": self.recording.recording_id,
            "task": self.recording.task,
            "config": self.recording.config,
            "snapshots": [
                {
                    "timestamp": s.timestamp.isoformat(),
                    "iteration": s.iteration,
                    "messages": s.messages,
                    "memory_state": s.memory_state,
                    "pending_actions": s.pending_actions,
                    "context": s.context
                }
                for s in self.recording.snapshots
            ],
            "llm_calls": self.recording.llm_calls,
            "tool_calls": self.recording.tool_calls,
            "final_result": self.recording.final_result
        }
        with open(path, "w") as f:
            json.dump(data, f, indent=2)
129
130
131class AgentReplayer:
132 """Replay recorded agent executions."""
133
134 def __init__(self, recording_path: str):
135 with open(recording_path) as f:
136 data = json.load(f)
137
138 self.recording = data
139 self.current_snapshot = 0
140
141 def get_snapshot(self, iteration: int) -> dict | None:
142 """Get snapshot at specific iteration."""
143 for snapshot in self.recording["snapshots"]:
144 if snapshot["iteration"] == iteration:
145 return snapshot
146 return None
147
148 def get_llm_calls_at(self, iteration: int) -> list[dict]:
149 """Get LLM calls at specific iteration."""
150 return [
151 call for call in self.recording["llm_calls"]
152 if call["iteration"] == iteration
153 ]
154
155 def get_tool_calls_at(self, iteration: int) -> list[dict]:
156 """Get tool calls at specific iteration."""
157 return [
158 call for call in self.recording["tool_calls"]
159 if call["iteration"] == iteration
160 ]
161
162 def step_forward(self) -> dict | None:
163 """Move to next snapshot."""
164 self.current_snapshot += 1
165 if self.current_snapshot < len(self.recording["snapshots"]):
166 return self.recording["snapshots"][self.current_snapshot]
167 return None
168
169 def step_backward(self) -> dict | None:
170 """Move to previous snapshot."""
171 if self.current_snapshot > 0:
172 self.current_snapshot -= 1
173 return self.recording["snapshots"][self.current_snapshot]
174 return None
175
176 def compare_snapshots(self, iter1: int, iter2: int) -> dict:
177 """Compare two snapshots."""
178 snap1 = self.get_snapshot(iter1)
179 snap2 = self.get_snapshot(iter2)
180
181 if not snap1 or not snap2:
182 return {"error": "Snapshot not found"}
183
184 # Compare messages
185 msg_diff = self._compare_lists(snap1["messages"], snap2["messages"])
186
187 # Compare memory
188 mem_diff = self._compare_dicts(snap1["memory_state"], snap2["memory_state"])
189
190 return {
191 "messages_added": msg_diff["added"],
192 "messages_removed": msg_diff["removed"],
193 "memory_changes": mem_diff
194 }
195
196 def _compare_lists(self, list1: list, list2: list) -> dict:
197 """Compare two lists."""
198 return {
199 "added": [x for x in list2 if x not in list1],
200 "removed": [x for x in list1 if x not in list2]
201 }
202
    def _compare_dicts(self, dict1: dict, dict2: dict) -> dict:
        """Compare two dictionaries.

        Returns {key: {"from": old, "to": new}} for every key whose value
        differs; a key missing on one side appears as None.
        """
        changes = {}
        all_keys = set(dict1.keys()) | set(dict2.keys())

        for key in all_keys:
            v1 = dict1.get(key)
            v2 = dict2.get(key)
            if v1 != v2:
                changes[key] = {"from": v1, "to": v2}

        return changes

Interactive Debugging
Step-Through Debugging
🐍python
1"""
2Interactive Agent Debugging
3
4Step through agent execution interactively.
5"""
6
from dataclasses import dataclass
from typing import Any, Callable
9
10
class InteractiveDebugger:
    """Interactive debugger for agent execution.

    The host agent loop calls debug_iteration(state) once per iteration;
    the debugger pauses when a breakpoint fires (or in step mode) and drops
    into a simple stdin-driven command prompt.
    """

    def __init__(self, agent):
        self.agent = agent
        self.breakpoints: list[Callable] = []   # predicates over the state dict
        self.watches: dict[str, Callable] = {}  # watch name -> extractor(state)
        self.paused = False      # True while the interactive prompt is active
        self.step_mode = False   # True => pause on every iteration

    def add_breakpoint(self, condition: Callable[[dict], bool]):
        """Add a conditional breakpoint; it fires when condition(state) is truthy."""
        self.breakpoints.append(condition)

    def add_watch(self, name: str, extractor: Callable[[dict], Any]):
        """Add a watch expression evaluated and displayed each iteration."""
        self.watches[name] = extractor

    def check_breakpoint(self, state: dict) -> bool:
        """Return True if any registered breakpoint matches *state*."""
        for bp in self.breakpoints:
            if bp(state):
                return True
        return False

    def get_watches(self, state: dict) -> dict:
        """Evaluate all watch expressions against *state*."""
        return {
            name: extractor(state)
            for name, extractor in self.watches.items()
        }

    def debug_iteration(self, state: dict):
        """Called at each iteration for debugging."""
        # Check breakpoints
        if self.check_breakpoint(state):
            self.paused = True
            print("\n=== BREAKPOINT HIT ===")

        # Step mode pauses unconditionally
        if self.step_mode:
            self.paused = True

        # Display watches
        watches = self.get_watches(state)
        if watches:
            print("\n--- Watch Values ---")
            for name, value in watches.items():
                print(f"  {name}: {value}")

        # Blocks on stdin until the user continues or steps.
        while self.paused:
            self._interactive_prompt(state)

    def _interactive_prompt(self, state: dict):
        """Interactive debugging prompt (handles one command per call)."""
        # BUG FIX: the help line previously omitted the supported "eval" command.
        print("\n[Debugger] Commands: (c)ontinue, (s)tep, (p)rint, (m)emory, (h)istory, (q)uit, eval <expr>")
        # BUG FIX: only lowercase a copy for dispatch; lowercasing the whole
        # line mangled case-sensitive eval expressions.
        raw = input("> ").strip()
        cmd = raw.lower()

        if cmd == "c":
            self.paused = False
            self.step_mode = False
        elif cmd == "s":
            self.paused = False
            self.step_mode = True
        elif cmd == "p":
            self._print_state(state)
        elif cmd == "m":
            self._print_memory(state)
        elif cmd == "h":
            self._print_history(state)
        elif cmd == "q":
            raise KeyboardInterrupt("Debug quit")
        elif cmd.startswith("eval "):
            self._eval_expression(raw[5:], state)

    def _print_state(self, state: dict):
        """Print current agent state."""
        print("\n--- Current State ---")
        print(f"Iteration: {state.get('iteration', 'N/A')}")
        print(f"Goal: {state.get('goal', 'N/A')[:100]}")
        print(f"Last action: {state.get('last_action', 'N/A')}")
        print(f"Pending: {len(state.get('pending_actions', []))} actions")

    def _print_memory(self, state: dict):
        """Print memory state."""
        print("\n--- Memory State ---")
        memory = state.get("memory", {})
        for key, value in memory.items():
            print(f"  {key}: {str(value)[:100]}")

    def _print_history(self, state: dict):
        """Print the last five recorded actions."""
        print("\n--- Action History ---")
        history = state.get("action_history", [])[-5:]
        for i, action in enumerate(history):
            print(f"  {i+1}. {action.get('type')}: {action.get('result', {}).get('success')}")

    def _eval_expression(self, expr: str, state: dict):
        """Evaluate a Python expression with `state` in scope.

        SECURITY: uses eval() on operator-typed input. Acceptable only in a
        trusted, local debugging session — never expose this to untrusted input.
        """
        try:
            result = eval(expr, {"state": state})
            print(f"=> {result}")
        except Exception as e:
            print(f"Error: {e}")
117
118# Usage example
119def create_debugger(agent):
120 debugger = InteractiveDebugger(agent)
121
122 # Break when agent loops
123 debugger.add_breakpoint(
124 lambda s: len(set(
125 a["type"] for a in s.get("action_history", [])[-3:]
126 )) == 1 if len(s.get("action_history", [])) >= 3 else False
127 )
128
129 # Break on error
130 debugger.add_breakpoint(
131 lambda s: s.get("last_result", {}).get("success") == False
132 )
133
134 # Watch important values
135 debugger.add_watch("goal", lambda s: s.get("goal", "")[:50])
136 debugger.add_watch("iteration", lambda s: s.get("iteration", 0))
137 debugger.add_watch("last_action", lambda s: s.get("last_action", {}).get("type"))
138
139 return debuggerDebugging Tools
Specialized Agent Debugging Tools
🐍python
1"""
2Agent Debugging Tools
3
4Specialized tools for debugging agent behavior.
5"""
6
7from dataclasses import dataclass
8from typing import Any
9
10
11class PromptInspector:
12 """Inspect and analyze prompts sent to LLM."""
13
14 def analyze_prompt(self, prompt: str | list[dict]) -> dict:
15 """Analyze a prompt for potential issues."""
16 issues = []
17
18 if isinstance(prompt, str):
19 text = prompt
20 else:
21 text = " ".join(m.get("content", "") for m in prompt)
22
23 # Check length
24 if len(text) > 10000:
25 issues.append({
26 "type": "length",
27 "message": f"Prompt is very long ({len(text)} chars)",
28 "suggestion": "Consider summarizing or truncating"
29 })
30
31 # Check for conflicting instructions
32 if "always" in text.lower() and "never" in text.lower():
33 issues.append({
34 "type": "conflict",
35 "message": "Prompt contains both 'always' and 'never'",
36 "suggestion": "Review for contradictory instructions"
37 })
38
39 # Check for vague language
40 vague_terms = ["maybe", "perhaps", "might", "could possibly"]
41 found_vague = [t for t in vague_terms if t in text.lower()]
42 if found_vague:
43 issues.append({
44 "type": "vagueness",
45 "message": f"Prompt contains vague terms: {found_vague}",
46 "suggestion": "Use more precise language"
47 })
48
49 return {
50 "length": len(text),
51 "word_count": len(text.split()),
52 "issues": issues
53 }
54
55
56class ResponseAnalyzer:
57 """Analyze LLM responses for issues."""
58
59 def analyze_response(self, response: str, expected_format: str = None) -> dict:
60 """Analyze an LLM response."""
61 issues = []
62
63 # Check for refusals
64 refusal_phrases = [
65 "I cannot", "I'm not able to", "I apologize",
66 "I'm sorry, but", "I don't have"
67 ]
68 for phrase in refusal_phrases:
69 if phrase.lower() in response.lower():
70 issues.append({
71 "type": "refusal",
72 "message": f"Response may be a refusal (contains '{phrase}')",
73 "snippet": response[:200]
74 })
75 break
76
77 # Check for uncertainty
78 uncertainty_phrases = ["I'm not sure", "I think", "possibly", "might be"]
79 for phrase in uncertainty_phrases:
80 if phrase.lower() in response.lower():
81 issues.append({
82 "type": "uncertainty",
83 "message": f"Response shows uncertainty ('{phrase}')"
84 })
85 break
86
87 # Check format if expected
88 if expected_format:
89 if expected_format == "json":
90 try:
91 import json
92 json.loads(response)
93 except:
94 issues.append({
95 "type": "format",
96 "message": "Expected JSON but response is not valid JSON"
97 })
98
99 return {
100 "length": len(response),
101 "issues": issues
102 }
103
104
class DecisionTracer:
    """Trace and explain agent decisions."""

    def __init__(self):
        self.decisions: list[dict] = []  # chronological decision records

    def record_decision(
        self,
        decision_point: str,
        options: list[str],
        chosen: str,
        reasoning: str,
        confidence: float
    ):
        """Record a decision made by the agent."""
        self.decisions.append({
            "decision_point": decision_point,
            "options": options,
            "chosen": chosen,
            "reasoning": reasoning,
            "confidence": confidence,
        })

    def explain_path(self) -> str:
        """Render the recorded decision sequence as a readable report."""
        lines = ["Decision Path Analysis:", "=" * 40]

        for index, decision in enumerate(self.decisions, 1):
            lines.extend([
                f"\n{index}. {decision['decision_point']}",
                f"   Options: {', '.join(decision['options'])}",
                f"   Chosen: {decision['chosen']} (confidence: {decision['confidence']:.0%})",
                f"   Reasoning: {decision['reasoning'][:100]}",
            ])

        return "\n".join(lines)

    def find_pivot_points(self) -> list[dict]:
        """Find decisions that significantly changed direction.

        A pivot is a low-confidence decision (< 0.5) and/or a decision whose
        chosen option was not among the previous decision's options.
        """
        pivots = []
        previous = None

        for index, decision in enumerate(self.decisions):
            # Shaky decisions are pivot candidates.
            if decision["confidence"] < 0.5:
                pivots.append({
                    "index": index,
                    "decision": decision,
                    "reason": "low_confidence"
                })

            # A choice that wasn't on the table last time marks a new direction.
            if previous is not None and decision["chosen"] not in set(previous["options"]):
                pivots.append({
                    "index": index,
                    "decision": decision,
                    "reason": "new_option"
                })

            previous = decision

        return pivots
164
165
class DebugReport:
    """Generate comprehensive debug reports."""

    def __init__(self):
        # Ordered {"title": ..., "content": ...} entries, rendered by generate().
        self.sections: list[dict] = []

    def add_section(self, title: str, content: str):
        """Add a section to the report."""
        self.sections.append({"title": title, "content": content})

    def generate(self) -> str:
        """Generate the full report.

        Format: a banner header, then each section as "## <title>", a rule
        line, and the section content.
        """
        lines = ["=" * 60, "AGENT DEBUG REPORT", "=" * 60]

        for section in self.sections:
            lines.append(f"\n## {section['title']}")
            lines.append("-" * 40)
            lines.append(section["content"])

        return "\n".join(lines)

Key Takeaways
- Automated issue detection catches common problems like loops, goal drift, and stuck states.
- Replay debugging allows you to reproduce and step through past executions.
- Interactive debugging provides step-through capabilities with breakpoints and watches.
- Specialized tools for prompt inspection, response analysis, and decision tracing aid diagnosis.
- Record everything - comprehensive logging makes debugging much easier.
Next Section Preview: We'll explore performance profiling for optimizing agent speed and resource usage.