Chapter 19
18 min read
Section 119 of 175

Debugging Agent Behavior

Observability and Debugging

Introduction

Debugging AI agents is uniquely challenging because their behavior is non-deterministic, context-dependent, and emerges from complex interactions between prompts, tools, and memory. This section covers techniques for understanding and fixing agent issues.

Section Overview: We'll explore common agent issues, replay debugging, interactive debugging techniques, and specialized debugging tools.

Common Agent Issues

Issue Categories

| Issue Type | Symptoms | Common Causes |
| --- | --- | --- |
| Loops | Same action repeated | Unclear goal, poor prompting |
| Goal drift | Gradual topic shift | Long context, weak constraints |
| Hallucination | False information | Knowledge gaps, overconfidence |
| Tool misuse | Wrong tool selection | Unclear tool descriptions |
| Memory failures | Forgets context | Context window limits |
| Stuck states | No progress | Ambiguous state, missing info |
🐍python
1"""
2Common Agent Issues and Detection
3
4Automated detection of common agent problems.
5"""
6
7from dataclasses import dataclass
8from datetime import datetime, timedelta
9from typing import Any
10
11
@dataclass
class DebugIssue:
    """A detected debugging issue, as produced by AgentDebugger.diagnose()."""
    issue_type: str        # machine-readable category, e.g. "exact_loop"
    severity: str          # one of: low, medium, high, critical
    description: str       # human-readable summary of the problem
    evidence: list[dict]   # supporting data extracted from the recorded histories
    suggested_fix: str     # actionable remediation hint


class AgentDebugger:
    """Detect and diagnose common agent issues.

    Feed the debugger via record_action / record_thought / record_goal as
    the agent runs, then call diagnose() to get a severity-sorted list of
    detected issues (loops, goal drift, stuck states, tool failures).
    """

    def __init__(self):
        self.action_history: list[dict] = []
        self.thought_history: list[str] = []
        self.goal_history: list[str] = []

    def record_action(self, action: dict):
        """Record an agent action, stamping it with the arrival time."""
        self.action_history.append({
            **action,
            "timestamp": datetime.now()
        })

    def record_thought(self, thought: str):
        """Record an agent thought."""
        self.thought_history.append(thought)

    def record_goal(self, goal: str):
        """Record the goal state at this point in the run."""
        self.goal_history.append(goal)

    def diagnose(self) -> list[DebugIssue]:
        """Run all diagnostic checks and return issues, most severe first."""
        issues: list[DebugIssue] = []

        issues.extend(self._check_loops())
        issues.extend(self._check_goal_drift())
        issues.extend(self._check_stuck_state())
        issues.extend(self._check_tool_patterns())

        return sorted(issues, key=lambda i: self._severity_order(i.severity))

    def _check_loops(self) -> list[DebugIssue]:
        """Detect action loops: exact repeats and two-action oscillation."""
        issues: list[DebugIssue] = []

        if len(self.action_history) < 3:
            return issues

        # Key each recent action by type plus a prefix of its input, so
        # "same action" means the same tool with the same arguments.
        recent = self.action_history[-5:]
        action_keys = [
            f"{a['type']}:{str(a.get('input', ''))[:50]}"
            for a in recent
        ]

        if len(set(action_keys)) == 1:
            issues.append(DebugIssue(
                issue_type="exact_loop",
                severity="high",
                description="Agent is repeating the exact same action",
                evidence=[{"actions": action_keys}],
                suggested_fix="Add loop detection in prompt or implement action deduplication"
            ))

        # Oscillation (A, B, A, B pattern). Require two *distinct* actions:
        # an exact loop (A, A, A, A) also matches the positional pattern and
        # would otherwise be double-reported as oscillation.
        if len(action_keys) >= 4 and action_keys[-1] != action_keys[-2]:
            if action_keys[-1] == action_keys[-3] and action_keys[-2] == action_keys[-4]:
                issues.append(DebugIssue(
                    issue_type="oscillation",
                    severity="medium",
                    description="Agent is oscillating between two actions",
                    evidence=[{"pattern": action_keys[-4:]}],
                    suggested_fix="Add state memory to track attempted approaches"
                ))

        return issues

    def _check_goal_drift(self) -> list[DebugIssue]:
        """Detect drift between the first and the most recent recorded goal."""
        issues: list[DebugIssue] = []

        if len(self.goal_history) < 2:
            return issues

        original = self.goal_history[0]
        current = self.goal_history[-1]

        # Jaccard word overlap as a cheap similarity proxy
        # (use embeddings in production).
        original_words = set(original.lower().split())
        current_words = set(current.lower().split())

        overlap = len(original_words & current_words)
        total = len(original_words | current_words)
        similarity = overlap / total if total > 0 else 0

        if similarity < 0.3:
            issues.append(DebugIssue(
                issue_type="goal_drift",
                severity="high",
                description=f"Current goal has drifted significantly from original ({similarity:.0%} overlap)",
                evidence=[
                    {"original_goal": original},
                    {"current_goal": current}
                ],
                suggested_fix="Add goal anchoring to prompts or periodic goal re-verification"
            ))

        return issues

    def _check_stuck_state(self) -> list[DebugIssue]:
        """Flag the agent as stuck when the last 5 actions all failed."""
        issues: list[DebugIssue] = []

        if len(self.action_history) < 5:
            return issues

        # Actions without an explicit result are treated as successes so
        # partially-instrumented runs do not false-positive here.
        recent_results = [
            a.get("result", {}).get("success", True)
            for a in self.action_history[-5:]
        ]

        if all(not r for r in recent_results):
            issues.append(DebugIssue(
                issue_type="stuck_state",
                severity="high",
                description="Last 5 actions all failed - agent may be stuck",
                evidence=[
                    {"failed_actions": [a["type"] for a in self.action_history[-5:]]}
                ],
                suggested_fix="Implement failure recovery strategy or escalation"
            ))

        return issues

    def _check_tool_patterns(self) -> list[DebugIssue]:
        """Flag any tool whose last 3 invocations all failed."""
        issues: list[DebugIssue] = []

        if len(self.action_history) < 3:
            return issues

        # Group success flags by tool name.
        tool_results: dict[str, list[bool]] = {}
        for action in self.action_history:
            tool = action.get("type", "unknown")
            success = action.get("result", {}).get("success", True)
            tool_results.setdefault(tool, []).append(success)

        for tool, results in tool_results.items():
            if len(results) >= 3 and not any(results[-3:]):
                issues.append(DebugIssue(
                    issue_type="tool_failure",
                    severity="medium",
                    description=f"Tool '{tool}' has failed last 3 attempts",
                    evidence=[{"tool": tool, "failure_count": len(results)}],
                    suggested_fix="Check tool configuration or add fallback tool"
                ))

        return issues

    def _severity_order(self, severity: str) -> int:
        """Map severity to a sort key; unknown severities sort last."""
        return {"critical": 0, "high": 1, "medium": 2, "low": 3}.get(severity, 4)

Replay Debugging

Reproducing Agent Behavior

🐍python
1"""
2Replay Debugging
3
4Record and replay agent executions for debugging.
5"""
6
7from dataclasses import dataclass, field
8from datetime import datetime
9from typing import Any
10import json
11
12
13@dataclass
14class AgentSnapshot:
15    """Snapshot of agent state at a point in time."""
16    timestamp: datetime
17    iteration: int
18    messages: list[dict]
19    memory_state: dict
20    pending_actions: list[dict]
21    context: dict
22
23
24@dataclass
25class ExecutionRecording:
26    """Complete recording of an agent execution."""
27    recording_id: str
28    task: str
29    config: dict
30    snapshots: list[AgentSnapshot] = field(default_factory=list)
31    llm_calls: list[dict] = field(default_factory=list)
32    tool_calls: list[dict] = field(default_factory=list)
33    final_result: dict | None = None
34
35
36class AgentRecorder:
37    """Record agent execution for replay."""
38
39    def __init__(self, recording_id: str):
40        self.recording = ExecutionRecording(
41            recording_id=recording_id,
42            task="",
43            config={}
44        )
45        self.iteration = 0
46
47    def start_recording(self, task: str, config: dict):
48        """Start a new recording."""
49        self.recording.task = task
50        self.recording.config = config
51
52    def snapshot(
53        self,
54        messages: list[dict],
55        memory_state: dict,
56        pending_actions: list[dict],
57        context: dict
58    ):
59        """Take a snapshot of current state."""
60        self.iteration += 1
61        self.recording.snapshots.append(AgentSnapshot(
62            timestamp=datetime.now(),
63            iteration=self.iteration,
64            messages=messages.copy(),
65            memory_state=memory_state.copy(),
66            pending_actions=pending_actions.copy(),
67            context=context.copy()
68        ))
69
70    def record_llm_call(
71        self,
72        request: dict,
73        response: dict,
74        duration_ms: float
75    ):
76        """Record an LLM API call."""
77        self.recording.llm_calls.append({
78            "iteration": self.iteration,
79            "timestamp": datetime.now().isoformat(),
80            "request": request,
81            "response": response,
82            "duration_ms": duration_ms
83        })
84
85    def record_tool_call(
86        self,
87        tool_name: str,
88        input_data: dict,
89        output_data: dict,
90        success: bool
91    ):
92        """Record a tool call."""
93        self.recording.tool_calls.append({
94            "iteration": self.iteration,
95            "timestamp": datetime.now().isoformat(),
96            "tool": tool_name,
97            "input": input_data,
98            "output": output_data,
99            "success": success
100        })
101
102    def end_recording(self, result: dict):
103        """End the recording."""
104        self.recording.final_result = result
105
106    def save(self, path: str):
107        """Save recording to file."""
108        data = {
109            "recording_id": self.recording.recording_id,
110            "task": self.recording.task,
111            "config": self.recording.config,
112            "snapshots": [
113                {
114                    "timestamp": s.timestamp.isoformat(),
115                    "iteration": s.iteration,
116                    "messages": s.messages,
117                    "memory_state": s.memory_state,
118                    "pending_actions": s.pending_actions,
119                    "context": s.context
120                }
121                for s in self.recording.snapshots
122            ],
123            "llm_calls": self.recording.llm_calls,
124            "tool_calls": self.recording.tool_calls,
125            "final_result": self.recording.final_result
126        }
127        with open(path, "w") as f:
128            json.dump(data, f, indent=2)
129
130
131class AgentReplayer:
132    """Replay recorded agent executions."""
133
134    def __init__(self, recording_path: str):
135        with open(recording_path) as f:
136            data = json.load(f)
137
138        self.recording = data
139        self.current_snapshot = 0
140
141    def get_snapshot(self, iteration: int) -> dict | None:
142        """Get snapshot at specific iteration."""
143        for snapshot in self.recording["snapshots"]:
144            if snapshot["iteration"] == iteration:
145                return snapshot
146        return None
147
148    def get_llm_calls_at(self, iteration: int) -> list[dict]:
149        """Get LLM calls at specific iteration."""
150        return [
151            call for call in self.recording["llm_calls"]
152            if call["iteration"] == iteration
153        ]
154
155    def get_tool_calls_at(self, iteration: int) -> list[dict]:
156        """Get tool calls at specific iteration."""
157        return [
158            call for call in self.recording["tool_calls"]
159            if call["iteration"] == iteration
160        ]
161
162    def step_forward(self) -> dict | None:
163        """Move to next snapshot."""
164        self.current_snapshot += 1
165        if self.current_snapshot < len(self.recording["snapshots"]):
166            return self.recording["snapshots"][self.current_snapshot]
167        return None
168
169    def step_backward(self) -> dict | None:
170        """Move to previous snapshot."""
171        if self.current_snapshot > 0:
172            self.current_snapshot -= 1
173            return self.recording["snapshots"][self.current_snapshot]
174        return None
175
176    def compare_snapshots(self, iter1: int, iter2: int) -> dict:
177        """Compare two snapshots."""
178        snap1 = self.get_snapshot(iter1)
179        snap2 = self.get_snapshot(iter2)
180
181        if not snap1 or not snap2:
182            return {"error": "Snapshot not found"}
183
184        # Compare messages
185        msg_diff = self._compare_lists(snap1["messages"], snap2["messages"])
186
187        # Compare memory
188        mem_diff = self._compare_dicts(snap1["memory_state"], snap2["memory_state"])
189
190        return {
191            "messages_added": msg_diff["added"],
192            "messages_removed": msg_diff["removed"],
193            "memory_changes": mem_diff
194        }
195
196    def _compare_lists(self, list1: list, list2: list) -> dict:
197        """Compare two lists."""
198        return {
199            "added": [x for x in list2 if x not in list1],
200            "removed": [x for x in list1 if x not in list2]
201        }
202
203    def _compare_dicts(self, dict1: dict, dict2: dict) -> dict:
204        """Compare two dictionaries."""
205        changes = {}
206        all_keys = set(dict1.keys()) | set(dict2.keys())
207
208        for key in all_keys:
209            v1 = dict1.get(key)
210            v2 = dict2.get(key)
211            if v1 != v2:
212                changes[key] = {"from": v1, "to": v2}
213
214        return changes

Interactive Debugging

Step-Through Debugging

🐍python
1"""
2Interactive Agent Debugging
3
4Step through agent execution interactively.
5"""
6
from dataclasses import dataclass
from typing import Any, Callable
9
10
class InteractiveDebugger:
    """Interactive debugger for agent execution.

    Register conditional breakpoints and watch expressions, then call
    debug_iteration(state) from the agent loop; when a breakpoint fires
    (or step mode is on) the debugger blocks on an interactive stdin
    prompt until the user continues.

    Note: `Any` and `Callable` must be imported from `typing` at module
    level — the original file omitted `Any`, making the `add_watch`
    annotation raise NameError at class-definition time.
    """

    def __init__(self, agent):
        self.agent = agent
        self.breakpoints: list[Callable] = []
        self.watches: dict[str, Callable] = {}
        self.paused = False       # currently stopped at the prompt
        self.step_mode = False    # pause again on the next iteration

    def add_breakpoint(self, condition: Callable[[dict], bool]):
        """Add a conditional breakpoint; `condition` receives the state dict."""
        self.breakpoints.append(condition)

    def add_watch(self, name: str, extractor: Callable[[dict], Any]):
        """Add a watch expression evaluated against the state each iteration."""
        self.watches[name] = extractor

    def check_breakpoint(self, state: dict) -> bool:
        """Return True if any registered breakpoint matches `state`."""
        for bp in self.breakpoints:
            if bp(state):
                return True
        return False

    def get_watches(self, state: dict) -> dict:
        """Evaluate all watch expressions against `state`."""
        return {
            name: extractor(state)
            for name, extractor in self.watches.items()
        }

    def debug_iteration(self, state: dict):
        """Called at each iteration for debugging."""
        # Check breakpoints
        if self.check_breakpoint(state):
            self.paused = True
            print("\n=== BREAKPOINT HIT ===")

        # Step mode pauses on every iteration
        if self.step_mode:
            self.paused = True

        # Display watches
        watches = self.get_watches(state)
        if watches:
            print("\n--- Watch Values ---")
            for name, value in watches.items():
                print(f"  {name}: {value}")

        # Block on the interactive prompt until the user continues
        while self.paused:
            self._interactive_prompt(state)

    def _interactive_prompt(self, state: dict):
        """Read and execute one interactive debugging command."""
        print("\n[Debugger] Commands: (c)ontinue, (s)tep, (p)rint, (m)emory, (h)istory, (q)uit, eval <expr>")
        # Keep the raw input: the original lowercased the whole line, which
        # mangled `eval` expressions containing uppercase (e.g. dict keys).
        raw = input("> ").strip()
        cmd = raw.lower()

        if cmd == "c":
            self.paused = False
            self.step_mode = False
        elif cmd == "s":
            self.paused = False
            self.step_mode = True
        elif cmd == "p":
            self._print_state(state)
        elif cmd == "m":
            self._print_memory(state)
        elif cmd == "h":
            self._print_history(state)
        elif cmd == "q":
            raise KeyboardInterrupt("Debug quit")
        elif cmd.startswith("eval "):
            self._eval_expression(raw[5:], state)

    def _print_state(self, state: dict):
        """Print a one-screen summary of the current agent state."""
        print("\n--- Current State ---")
        print(f"Iteration: {state.get('iteration', 'N/A')}")
        print(f"Goal: {state.get('goal', 'N/A')[:100]}")
        print(f"Last action: {state.get('last_action', 'N/A')}")
        print(f"Pending: {len(state.get('pending_actions', []))} actions")

    def _print_memory(self, state: dict):
        """Print the memory state, truncating long values."""
        print("\n--- Memory State ---")
        memory = state.get("memory", {})
        for key, value in memory.items():
            print(f"  {key}: {str(value)[:100]}")

    def _print_history(self, state: dict):
        """Print the last 5 recorded actions with their success flags."""
        print("\n--- Action History ---")
        history = state.get("action_history", [])[-5:]
        for i, action in enumerate(history):
            print(f"  {i+1}. {action.get('type')}: {action.get('result', {}).get('success')}")

    def _eval_expression(self, expr: str, state: dict):
        """Evaluate a Python expression with `state` in scope.

        WARNING: uses eval by design (developer-facing debug console) —
        never expose this prompt to untrusted input.
        """
        try:
            result = eval(expr, {"state": state})
            print(f"=> {result}")
        except Exception as e:
            print(f"Error: {e}")


# Usage example
def create_debugger(agent):
    """Build an InteractiveDebugger preloaded with common breakpoints/watches."""
    debugger = InteractiveDebugger(agent)

    # Break when the last 3 actions all share one type (likely a loop)
    debugger.add_breakpoint(
        lambda s: len(set(
            a["type"] for a in s.get("action_history", [])[-3:]
        )) == 1 if len(s.get("action_history", [])) >= 3 else False
    )

    # Break when the last action explicitly failed
    debugger.add_breakpoint(
        lambda s: s.get("last_result", {}).get("success") is False
    )

    # Watch important values
    debugger.add_watch("goal", lambda s: s.get("goal", "")[:50])
    debugger.add_watch("iteration", lambda s: s.get("iteration", 0))
    debugger.add_watch("last_action", lambda s: s.get("last_action", {}).get("type"))

    return debugger

Debugging Tools

Specialized Agent Debugging Tools

🐍python
1"""
2Agent Debugging Tools
3
4Specialized tools for debugging agent behavior.
5"""
6
7from dataclasses import dataclass
8from typing import Any
9
10
11class PromptInspector:
12    """Inspect and analyze prompts sent to LLM."""
13
14    def analyze_prompt(self, prompt: str | list[dict]) -> dict:
15        """Analyze a prompt for potential issues."""
16        issues = []
17
18        if isinstance(prompt, str):
19            text = prompt
20        else:
21            text = " ".join(m.get("content", "") for m in prompt)
22
23        # Check length
24        if len(text) > 10000:
25            issues.append({
26                "type": "length",
27                "message": f"Prompt is very long ({len(text)} chars)",
28                "suggestion": "Consider summarizing or truncating"
29            })
30
31        # Check for conflicting instructions
32        if "always" in text.lower() and "never" in text.lower():
33            issues.append({
34                "type": "conflict",
35                "message": "Prompt contains both 'always' and 'never'",
36                "suggestion": "Review for contradictory instructions"
37            })
38
39        # Check for vague language
40        vague_terms = ["maybe", "perhaps", "might", "could possibly"]
41        found_vague = [t for t in vague_terms if t in text.lower()]
42        if found_vague:
43            issues.append({
44                "type": "vagueness",
45                "message": f"Prompt contains vague terms: {found_vague}",
46                "suggestion": "Use more precise language"
47            })
48
49        return {
50            "length": len(text),
51            "word_count": len(text.split()),
52            "issues": issues
53        }
54
55
56class ResponseAnalyzer:
57    """Analyze LLM responses for issues."""
58
59    def analyze_response(self, response: str, expected_format: str = None) -> dict:
60        """Analyze an LLM response."""
61        issues = []
62
63        # Check for refusals
64        refusal_phrases = [
65            "I cannot", "I'm not able to", "I apologize",
66            "I'm sorry, but", "I don't have"
67        ]
68        for phrase in refusal_phrases:
69            if phrase.lower() in response.lower():
70                issues.append({
71                    "type": "refusal",
72                    "message": f"Response may be a refusal (contains '{phrase}')",
73                    "snippet": response[:200]
74                })
75                break
76
77        # Check for uncertainty
78        uncertainty_phrases = ["I'm not sure", "I think", "possibly", "might be"]
79        for phrase in uncertainty_phrases:
80            if phrase.lower() in response.lower():
81                issues.append({
82                    "type": "uncertainty",
83                    "message": f"Response shows uncertainty ('{phrase}')"
84                })
85                break
86
87        # Check format if expected
88        if expected_format:
89            if expected_format == "json":
90                try:
91                    import json
92                    json.loads(response)
93                except:
94                    issues.append({
95                        "type": "format",
96                        "message": "Expected JSON but response is not valid JSON"
97                    })
98
99        return {
100            "length": len(response),
101            "issues": issues
102        }
103
104
class DecisionTracer:
    """Trace and explain agent decisions."""

    def __init__(self):
        self.decisions: list[dict] = []

    def record_decision(
        self,
        decision_point: str,
        options: list[str],
        chosen: str,
        reasoning: str,
        confidence: float
    ):
        """Record a decision made by the agent."""
        entry = {
            "decision_point": decision_point,
            "options": options,
            "chosen": chosen,
            "reasoning": reasoning,
            "confidence": confidence
        }
        self.decisions.append(entry)

    def explain_path(self) -> str:
        """Generate explanation of decision path."""
        lines = ["Decision Path Analysis:", "=" * 40]

        for num, entry in enumerate(self.decisions, 1):
            lines += [
                f"\n{num}. {entry['decision_point']}",
                f"   Options: {', '.join(entry['options'])}",
                f"   Chosen: {entry['chosen']} (confidence: {entry['confidence']:.0%})",
                f"   Reasoning: {entry['reasoning'][:100]}",
            ]

        return "\n".join(lines)

    def find_pivot_points(self) -> list[dict]:
        """Find decisions that significantly changed direction.

        A pivot is either a low-confidence decision (< 0.5) or a decision
        whose chosen option was not among the previous decision's options.
        """
        pivots = []

        for idx, entry in enumerate(self.decisions):
            if entry["confidence"] < 0.5:
                pivots.append({
                    "index": idx,
                    "decision": entry,
                    "reason": "low_confidence"
                })

            if idx > 0 and entry["chosen"] not in set(self.decisions[idx - 1]["options"]):
                pivots.append({
                    "index": idx,
                    "decision": entry,
                    "reason": "new_option"
                })

        return pivots
164
165
class DebugReport:
    """Generate comprehensive debug reports."""

    def __init__(self):
        self.sections: list[dict] = []

    def add_section(self, title: str, content: str):
        """Add a section to the report."""
        self.sections.append({"title": title, "content": content})

    def generate(self) -> str:
        """Generate the full report."""
        banner = "=" * 60
        lines = [banner, "AGENT DEBUG REPORT", banner]

        # Each section: markdown-style heading, rule, then its body.
        for sec in self.sections:
            lines += [f"\n## {sec['title']}", "-" * 40, sec["content"]]

        return "\n".join(lines)

Key Takeaways

  • Automated issue detection catches common problems like loops, goal drift, and stuck states.
  • Replay debugging allows you to reproduce and step through past executions.
  • Interactive debugging provides step-through capabilities with breakpoints and watches.
  • Specialized tools for prompt inspection, response analysis, and decision tracing aid diagnosis.
  • Record everything - comprehensive logging makes debugging much easier.
Next Section Preview: We'll explore performance profiling for optimizing agent speed and resource usage.