Introduction
Distributed tracing allows you to follow a request as it flows through your agent system, including LLM calls, tool invocations, and memory operations. This visibility is essential for understanding agent behavior and diagnosing issues.
Section Overview: We'll cover distributed tracing concepts, span creation, agent-specific tracing patterns, and trace visualization techniques.
Distributed Tracing Basics
Core Concepts
| Concept | Description | Example |
|---|---|---|
| Trace | End-to-end request flow | User query to response |
| Span | Single operation within trace | LLM call, tool use |
| Parent/Child | Span hierarchy | Task contains multiple steps |
| Baggage | Data propagated across spans | User ID, agent config |
| Attributes | Span metadata | Duration, status, errors |
🐍python
1"""
2Distributed Tracing for AI Agents
3
4OpenTelemetry-compatible tracing implementation.
5"""
6
7from dataclasses import dataclass, field
8from datetime import datetime
9from enum import Enum
10from typing import Any, Optional
11import uuid
12import time
13
14
15class SpanStatus(Enum):
16 UNSET = "unset"
17 OK = "ok"
18 ERROR = "error"
19
20
21@dataclass
22class SpanEvent:
23 """Event within a span."""
24 name: str
25 timestamp: datetime
26 attributes: dict = field(default_factory=dict)
27
28
29@dataclass
30class Span:
31 """A single span in a trace."""
32 trace_id: str
33 span_id: str
34 name: str
35 parent_span_id: str | None
36 start_time: datetime
37 end_time: datetime | None = None
38 status: SpanStatus = SpanStatus.UNSET
39 attributes: dict = field(default_factory=dict)
40 events: list[SpanEvent] = field(default_factory=list)
41 baggage: dict = field(default_factory=dict)
42
43 @property
44 def duration_ms(self) -> float | None:
45 if self.end_time:
46 return (self.end_time - self.start_time).total_seconds() * 1000
47 return None
48
49 def set_attribute(self, key: str, value: Any):
50 """Set a span attribute."""
51 self.attributes[key] = value
52
53 def add_event(self, name: str, **attributes):
54 """Add an event to the span."""
55 self.events.append(SpanEvent(
56 name=name,
57 timestamp=datetime.utcnow(),
58 attributes=attributes
59 ))
60
61 def set_status(self, status: SpanStatus, description: str = ""):
62 """Set span status."""
63 self.status = status
64 if description:
65 self.attributes["status_description"] = description
66
67 def end(self):
68 """End the span."""
69 self.end_time = datetime.utcnow()
70
71
72class Trace:
73 """A complete trace with all spans."""
74
75 def __init__(self, trace_id: str | None = None):
76 self.trace_id = trace_id or str(uuid.uuid4())
77 self.spans: list[Span] = []
78 self.baggage: dict = {}
79
80 def add_span(self, span: Span):
81 """Add a span to the trace."""
82 self.spans.append(span)
83
84 def get_root_span(self) -> Span | None:
85 """Get the root span."""
86 for span in self.spans:
87 if span.parent_span_id is None:
88 return span
89 return None
90
91 def get_span_tree(self) -> dict:
92 """Get spans as a tree structure."""
93 span_map = {s.span_id: s for s in self.spans}
94 children: dict[str, list] = {}
95
96 for span in self.spans:
97 parent_id = span.parent_span_id or "root"
98 if parent_id not in children:
99 children[parent_id] = []
100 children[parent_id].append(span)
101
102 def build_tree(span_id: str) -> dict:
103 span = span_map.get(span_id)
104 if not span:
105 return {}
106 return {
107 "span": span,
108 "children": [
109 build_tree(child.span_id)
110 for child in children.get(span_id, [])
111 ]
112 }
113
114 root = self.get_root_span()
115 if root:
116 return build_tree(root.span_id)
117 return {}
118
119
120class Tracer:
121 """Create and manage traces and spans."""
122
123 def __init__(self, service_name: str):
124 self.service_name = service_name
125 self.current_trace: Trace | None = None
126 self.span_stack: list[Span] = []
127 self.exporters: list = []
128
129 def start_trace(self, name: str, **attributes) -> Trace:
130 """Start a new trace."""
131 self.current_trace = Trace()
132 root_span = self.start_span(name, **attributes)
133 return self.current_trace
134
135 def start_span(
136 self,
137 name: str,
138 parent: Span | None = None,
139 **attributes
140 ) -> Span:
141 """Start a new span."""
142 if not self.current_trace:
143 self.current_trace = Trace()
144
145 parent_id = None
146 if parent:
147 parent_id = parent.span_id
148 elif self.span_stack:
149 parent_id = self.span_stack[-1].span_id
150
151 span = Span(
152 trace_id=self.current_trace.trace_id,
153 span_id=str(uuid.uuid4()),
154 name=name,
155 parent_span_id=parent_id,
156 start_time=datetime.utcnow(),
157 attributes={
158 "service.name": self.service_name,
159 **attributes
160 },
161 baggage=self.current_trace.baggage.copy()
162 )
163
164 self.current_trace.add_span(span)
165 self.span_stack.append(span)
166
167 return span
168
169 def end_span(self, span: Span | None = None):
170 """End a span."""
171 if span:
172 span.end()
173 if span in self.span_stack:
174 self.span_stack.remove(span)
175 elif self.span_stack:
176 self.span_stack[-1].end()
177 self.span_stack.pop()
178
179 def get_current_span(self) -> Span | None:
180 """Get the current active span."""
181 return self.span_stack[-1] if self.span_stack else None
182
183 def add_exporter(self, exporter):
184 """Add a trace exporter."""
185 self.exporters.append(exporter)
186
187 def export_trace(self):
188 """Export the current trace."""
189 if self.current_trace:
190 for exporter in self.exporters:
191 exporter.export(self.current_trace)Creating Spans
Span Decorator Pattern
🐍python
1"""
2Span Creation Patterns
3
4Convenient ways to create spans around operations.
5"""
6
7from contextlib import contextmanager
8from functools import wraps
9from typing import Callable, Generator
10
11
12def traced(
13 name: str | None = None,
14 attributes: dict | None = None
15):
16 """Decorator to trace a function."""
17 def decorator(func: Callable) -> Callable:
18 span_name = name or func.__name__
19
20 @wraps(func)
21 async def async_wrapper(*args, **kwargs):
22 tracer = get_tracer() # Get global tracer
23 span = tracer.start_span(
24 span_name,
25 **{
26 "function.name": func.__name__,
27 "function.module": func.__module__,
28 **(attributes or {})
29 }
30 )
31 try:
32 result = await func(*args, **kwargs)
33 span.set_status(SpanStatus.OK)
34 return result
35 except Exception as e:
36 span.set_status(SpanStatus.ERROR, str(e))
37 span.add_event("exception", type=type(e).__name__, message=str(e))
38 raise
39 finally:
40 tracer.end_span(span)
41
42 @wraps(func)
43 def sync_wrapper(*args, **kwargs):
44 tracer = get_tracer()
45 span = tracer.start_span(
46 span_name,
47 **{
48 "function.name": func.__name__,
49 "function.module": func.__module__,
50 **(attributes or {})
51 }
52 )
53 try:
54 result = func(*args, **kwargs)
55 span.set_status(SpanStatus.OK)
56 return result
57 except Exception as e:
58 span.set_status(SpanStatus.ERROR, str(e))
59 span.add_event("exception", type=type(e).__name__, message=str(e))
60 raise
61 finally:
62 tracer.end_span(span)
63
64 import asyncio
65 if asyncio.iscoroutinefunction(func):
66 return async_wrapper
67 return sync_wrapper
68
69 return decorator
70
71
72@contextmanager
73def trace_span(
74 name: str,
75 **attributes
76) -> Generator[Span, None, None]:
77 """Context manager for creating spans."""
78 tracer = get_tracer()
79 span = tracer.start_span(name, **attributes)
80 try:
81 yield span
82 except Exception as e:
83 span.set_status(SpanStatus.ERROR, str(e))
84 span.add_event("exception", type=type(e).__name__, message=str(e))
85 raise
86 finally:
87 span.set_status(SpanStatus.OK if span.status == SpanStatus.UNSET else span.status)
88 tracer.end_span(span)
89
90
91# Usage examples
92@traced("llm_call", attributes={"model": "gpt-4"})
93async def call_llm(prompt: str) -> str:
94 """LLM call with automatic tracing."""
95 # LLM call implementation
96 return "response"
97
98
99@traced("tool_execution")
100def execute_tool(tool_name: str, params: dict) -> dict:
101 """Tool execution with automatic tracing."""
102 # Tool execution implementation
103 return {"result": "success"}
104
105
106async def process_request(request: str):
107 """Example of manual span creation."""
108 with trace_span("process_request", request_length=len(request)) as span:
109 span.add_event("request_received")
110
111 # Add custom attributes
112 span.set_attribute("request.type", "query")
113
114 with trace_span("validate_input") as validation_span:
115 is_valid = validate(request)
116 validation_span.set_attribute("valid", is_valid)
117
118 with trace_span("generate_response") as response_span:
119 response = await generate(request)
120 response_span.set_attribute("response.length", len(response))
121
122 return responseAgent-Specific Tracing
Tracing Agent Components
🐍python
1"""
2Agent-Specific Tracing
3
4Trace patterns for common agent operations.
5"""
6
7from dataclasses import dataclass
8from typing import Any
9
10
11class AgentTracer:
12 """Tracer specialized for agent operations."""
13
14 def __init__(self, base_tracer: Tracer):
15 self.tracer = base_tracer
16
17 @contextmanager
18 def trace_agent_run(self, task: str, agent_config: dict):
19 """Trace an entire agent run."""
20 with trace_span(
21 "agent_run",
22 task_preview=task[:100],
23 agent_type=agent_config.get("type"),
24 model=agent_config.get("model")
25 ) as span:
26 span.add_event("agent_started", config=agent_config)
27 yield span
28 span.add_event("agent_completed")
29
30 @contextmanager
31 def trace_llm_call(
32 self,
33 model: str,
34 prompt_tokens: int,
35 temperature: float
36 ):
37 """Trace an LLM API call."""
38 with trace_span(
39 "llm_call",
40 **{
41 "llm.model": model,
42 "llm.prompt_tokens": prompt_tokens,
43 "llm.temperature": temperature
44 }
45 ) as span:
46 start = time.time()
47 yield span
48 span.set_attribute("llm.duration_ms", (time.time() - start) * 1000)
49
50 @contextmanager
51 def trace_tool_call(
52 self,
53 tool_name: str,
54 tool_input: dict
55 ):
56 """Trace a tool invocation."""
57 with trace_span(
58 f"tool:{tool_name}",
59 **{
60 "tool.name": tool_name,
61 "tool.input_size": len(str(tool_input))
62 }
63 ) as span:
64 span.add_event("tool_invoked", input_keys=list(tool_input.keys()))
65 yield span
66
67 @contextmanager
68 def trace_memory_operation(
69 self,
70 operation: str,
71 memory_type: str
72 ):
73 """Trace a memory operation."""
74 with trace_span(
75 f"memory:{operation}",
76 **{
77 "memory.operation": operation,
78 "memory.type": memory_type
79 }
80 ) as span:
81 yield span
82
83 @contextmanager
84 def trace_decision(
85 self,
86 decision_type: str,
87 options: list[str]
88 ):
89 """Trace an agent decision."""
90 with trace_span(
91 f"decision:{decision_type}",
92 **{
93 "decision.type": decision_type,
94 "decision.option_count": len(options)
95 }
96 ) as span:
97 yield span
98
99
100class TracedAgentExecutor:
101 """Agent executor with comprehensive tracing."""
102
103 def __init__(self, agent_tracer: AgentTracer):
104 self.tracer = agent_tracer
105 self.iteration_count = 0
106
107 async def run(self, task: str, config: dict) -> dict:
108 """Run agent with full tracing."""
109 with self.tracer.trace_agent_run(task, config) as root_span:
110 root_span.set_attribute("task.length", len(task))
111
112 iterations = []
113 max_iterations = config.get("max_iterations", 10)
114
115 while self.iteration_count < max_iterations:
116 self.iteration_count += 1
117
118 with trace_span(
119 f"iteration_{self.iteration_count}",
120 iteration=self.iteration_count
121 ) as iter_span:
122
123 # Think
124 with self.tracer.trace_llm_call(
125 model=config["model"],
126 prompt_tokens=1000,
127 temperature=0.7
128 ) as llm_span:
129 thought = await self._think(task)
130 llm_span.set_attribute("llm.completion_tokens", 200)
131
132 # Decide
133 with self.tracer.trace_decision(
134 "action_selection",
135 options=["search", "analyze", "respond"]
136 ) as decision_span:
137 action = await self._decide(thought)
138 decision_span.set_attribute("decision.chosen", action["type"])
139
140 # Act
141 if action["type"] == "respond":
142 iter_span.add_event("task_complete")
143 break
144
145 with self.tracer.trace_tool_call(
146 action["type"],
147 action.get("input", {})
148 ) as tool_span:
149 result = await self._act(action)
150 tool_span.set_attribute("tool.success", result.get("success"))
151
152 iterations.append({
153 "thought": thought,
154 "action": action,
155 "result": result
156 })
157
158 root_span.set_attribute("total_iterations", self.iteration_count)
159 return {"iterations": iterations, "task": task}
160
161 async def _think(self, context: str) -> str:
162 return "Thinking about the task..."
163
164 async def _decide(self, thought: str) -> dict:
165 return {"type": "search", "input": {"query": "test"}}
166
167 async def _act(self, action: dict) -> dict:
168 return {"success": True, "data": "result"}Trace Visualization
Rendering Traces
🐍python
1"""
2Trace Visualization
3
4Generate visual representations of traces.
5"""
6
7from dataclasses import dataclass
8
9
10class TraceRenderer:
11 """Render traces for visualization."""
12
13 def to_timeline(self, trace: Trace) -> str:
14 """Render trace as ASCII timeline."""
15 if not trace.spans:
16 return "Empty trace"
17
18 lines = []
19 root = trace.get_root_span()
20 if not root:
21 return "No root span"
22
23 # Find time bounds
24 min_time = min(s.start_time for s in trace.spans)
25 max_time = max(s.end_time or s.start_time for s in trace.spans)
26 total_duration = (max_time - min_time).total_seconds() * 1000
27
28 def render_span(span: Span, depth: int = 0):
29 indent = " " * depth
30 start_offset = (span.start_time - min_time).total_seconds() * 1000
31 duration = span.duration_ms or 0
32
33 # Calculate bar position
34 bar_width = 50
35 start_pos = int((start_offset / total_duration) * bar_width) if total_duration > 0 else 0
36 end_pos = int(((start_offset + duration) / total_duration) * bar_width) if total_duration > 0 else 0
37 bar_length = max(1, end_pos - start_pos)
38
39 bar = " " * start_pos + "=" * bar_length
40 status_icon = "+" if span.status == SpanStatus.OK else "x" if span.status == SpanStatus.ERROR else "?"
41
42 lines.append(
43 f"{indent}{status_icon} {span.name} [{duration:.1f}ms]"
44 )
45 lines.append(f"{indent} |{bar}|")
46
47 # Render children
48 children = [s for s in trace.spans if s.parent_span_id == span.span_id]
49 for child in sorted(children, key=lambda s: s.start_time):
50 render_span(child, depth + 1)
51
52 render_span(root)
53 return "\n".join(lines)
54
55 def to_json(self, trace: Trace) -> dict:
56 """Export trace as JSON for visualization tools."""
57 return {
58 "trace_id": trace.trace_id,
59 "spans": [
60 {
61 "span_id": s.span_id,
62 "parent_span_id": s.parent_span_id,
63 "name": s.name,
64 "start_time": s.start_time.isoformat(),
65 "end_time": s.end_time.isoformat() if s.end_time else None,
66 "duration_ms": s.duration_ms,
67 "status": s.status.value,
68 "attributes": s.attributes,
69 "events": [
70 {
71 "name": e.name,
72 "timestamp": e.timestamp.isoformat(),
73 "attributes": e.attributes
74 }
75 for e in s.events
76 ]
77 }
78 for s in trace.spans
79 ]
80 }
81
82 def to_jaeger_format(self, trace: Trace) -> dict:
83 """Export in Jaeger-compatible format."""
84 processes = {
85 "p1": {
86 "serviceName": trace.spans[0].attributes.get("service.name", "unknown")
87 if trace.spans else "unknown"
88 }
89 }
90
91 spans = []
92 for s in trace.spans:
93 spans.append({
94 "traceID": trace.trace_id,
95 "spanID": s.span_id,
96 "operationName": s.name,
97 "references": [
98 {
99 "refType": "CHILD_OF",
100 "traceID": trace.trace_id,
101 "spanID": s.parent_span_id
102 }
103 ] if s.parent_span_id else [],
104 "startTime": int(s.start_time.timestamp() * 1_000_000),
105 "duration": int(s.duration_ms * 1000) if s.duration_ms else 0,
106 "tags": [
107 {"key": k, "value": v, "type": "string"}
108 for k, v in s.attributes.items()
109 ],
110 "logs": [
111 {
112 "timestamp": int(e.timestamp.timestamp() * 1_000_000),
113 "fields": [
114 {"key": "event", "value": e.name, "type": "string"},
115 *[
116 {"key": k, "value": str(v), "type": "string"}
117 for k, v in e.attributes.items()
118 ]
119 ]
120 }
121 for e in s.events
122 ],
123 "processID": "p1"
124 })
125
126 return {
127 "data": [{
128 "traceID": trace.trace_id,
129 "spans": spans,
130 "processes": processes
131 }]
132 }
133
134
135# Example usage
136renderer = TraceRenderer()
137
138# Create a sample trace
139tracer = Tracer("agent_service")
140tracer.start_trace("process_query")
141
142with trace_span("validate") as s:
143 s.add_event("validation_passed")
144
145with trace_span("llm_call") as s:
146 s.set_attribute("model", "gpt-4")
147 with trace_span("generate") as inner:
148 inner.set_attribute("tokens", 500)
149
150tracer.end_span()
151
152# Render
153print(renderer.to_timeline(tracer.current_trace))Key Takeaways
- Traces provide end-to-end visibility into agent request flows across all components.
- Spans represent individual operations with timing, status, and custom attributes.
- Use decorators and context managers for clean, consistent span creation.
- Agent-specific tracing patterns capture LLM calls, tool usage, memory operations, and decisions.
- Visualization tools help understand complex agent behavior and identify bottlenecks.
Next Section Preview: We'll explore debugging techniques for understanding and fixing agent behavior issues.