Chapter 19
18 min read
Section 118 of 175

Tracing and Spans

Observability and Debugging

Introduction

Distributed tracing allows you to follow a request as it flows through your agent system, including LLM calls, tool invocations, and memory operations. This visibility is essential for understanding agent behavior and diagnosing issues.

Section Overview: We'll cover distributed tracing concepts, span creation, agent-specific tracing patterns, and trace visualization techniques.

Distributed Tracing Basics

Core Concepts

ConceptDescriptionExample
TraceEnd-to-end request flowUser query to response
SpanSingle operation within traceLLM call, tool use
Parent/ChildSpan hierarchyTask contains multiple steps
BaggageData propagated across spansUser ID, agent config
AttributesSpan metadataDuration, status, errors
🐍python
1"""
2Distributed Tracing for AI Agents
3
4OpenTelemetry-compatible tracing implementation.
5"""
6
7from dataclasses import dataclass, field
8from datetime import datetime
9from enum import Enum
10from typing import Any, Optional
11import uuid
12import time
13
14
15class SpanStatus(Enum):
16    UNSET = "unset"
17    OK = "ok"
18    ERROR = "error"
19
20
21@dataclass
22class SpanEvent:
23    """Event within a span."""
24    name: str
25    timestamp: datetime
26    attributes: dict = field(default_factory=dict)
27
28
29@dataclass
30class Span:
31    """A single span in a trace."""
32    trace_id: str
33    span_id: str
34    name: str
35    parent_span_id: str | None
36    start_time: datetime
37    end_time: datetime | None = None
38    status: SpanStatus = SpanStatus.UNSET
39    attributes: dict = field(default_factory=dict)
40    events: list[SpanEvent] = field(default_factory=list)
41    baggage: dict = field(default_factory=dict)
42
43    @property
44    def duration_ms(self) -> float | None:
45        if self.end_time:
46            return (self.end_time - self.start_time).total_seconds() * 1000
47        return None
48
49    def set_attribute(self, key: str, value: Any):
50        """Set a span attribute."""
51        self.attributes[key] = value
52
53    def add_event(self, name: str, **attributes):
54        """Add an event to the span."""
55        self.events.append(SpanEvent(
56            name=name,
57            timestamp=datetime.utcnow(),
58            attributes=attributes
59        ))
60
61    def set_status(self, status: SpanStatus, description: str = ""):
62        """Set span status."""
63        self.status = status
64        if description:
65            self.attributes["status_description"] = description
66
67    def end(self):
68        """End the span."""
69        self.end_time = datetime.utcnow()
70
71
72class Trace:
73    """A complete trace with all spans."""
74
75    def __init__(self, trace_id: str | None = None):
76        self.trace_id = trace_id or str(uuid.uuid4())
77        self.spans: list[Span] = []
78        self.baggage: dict = {}
79
80    def add_span(self, span: Span):
81        """Add a span to the trace."""
82        self.spans.append(span)
83
84    def get_root_span(self) -> Span | None:
85        """Get the root span."""
86        for span in self.spans:
87            if span.parent_span_id is None:
88                return span
89        return None
90
91    def get_span_tree(self) -> dict:
92        """Get spans as a tree structure."""
93        span_map = {s.span_id: s for s in self.spans}
94        children: dict[str, list] = {}
95
96        for span in self.spans:
97            parent_id = span.parent_span_id or "root"
98            if parent_id not in children:
99                children[parent_id] = []
100            children[parent_id].append(span)
101
102        def build_tree(span_id: str) -> dict:
103            span = span_map.get(span_id)
104            if not span:
105                return {}
106            return {
107                "span": span,
108                "children": [
109                    build_tree(child.span_id)
110                    for child in children.get(span_id, [])
111                ]
112            }
113
114        root = self.get_root_span()
115        if root:
116            return build_tree(root.span_id)
117        return {}
118
119
120class Tracer:
121    """Create and manage traces and spans."""
122
123    def __init__(self, service_name: str):
124        self.service_name = service_name
125        self.current_trace: Trace | None = None
126        self.span_stack: list[Span] = []
127        self.exporters: list = []
128
129    def start_trace(self, name: str, **attributes) -> Trace:
130        """Start a new trace."""
131        self.current_trace = Trace()
132        root_span = self.start_span(name, **attributes)
133        return self.current_trace
134
135    def start_span(
136        self,
137        name: str,
138        parent: Span | None = None,
139        **attributes
140    ) -> Span:
141        """Start a new span."""
142        if not self.current_trace:
143            self.current_trace = Trace()
144
145        parent_id = None
146        if parent:
147            parent_id = parent.span_id
148        elif self.span_stack:
149            parent_id = self.span_stack[-1].span_id
150
151        span = Span(
152            trace_id=self.current_trace.trace_id,
153            span_id=str(uuid.uuid4()),
154            name=name,
155            parent_span_id=parent_id,
156            start_time=datetime.utcnow(),
157            attributes={
158                "service.name": self.service_name,
159                **attributes
160            },
161            baggage=self.current_trace.baggage.copy()
162        )
163
164        self.current_trace.add_span(span)
165        self.span_stack.append(span)
166
167        return span
168
169    def end_span(self, span: Span | None = None):
170        """End a span."""
171        if span:
172            span.end()
173            if span in self.span_stack:
174                self.span_stack.remove(span)
175        elif self.span_stack:
176            self.span_stack[-1].end()
177            self.span_stack.pop()
178
179    def get_current_span(self) -> Span | None:
180        """Get the current active span."""
181        return self.span_stack[-1] if self.span_stack else None
182
183    def add_exporter(self, exporter):
184        """Add a trace exporter."""
185        self.exporters.append(exporter)
186
187    def export_trace(self):
188        """Export the current trace."""
189        if self.current_trace:
190            for exporter in self.exporters:
191                exporter.export(self.current_trace)

Creating Spans

Span Decorator Pattern

🐍python
1"""
2Span Creation Patterns
3
4Convenient ways to create spans around operations.
5"""
6
7from contextlib import contextmanager
8from functools import wraps
9from typing import Callable, Generator
10
11
12def traced(
13    name: str | None = None,
14    attributes: dict | None = None
15):
16    """Decorator to trace a function."""
17    def decorator(func: Callable) -> Callable:
18        span_name = name or func.__name__
19
20        @wraps(func)
21        async def async_wrapper(*args, **kwargs):
22            tracer = get_tracer()  # Get global tracer
23            span = tracer.start_span(
24                span_name,
25                **{
26                    "function.name": func.__name__,
27                    "function.module": func.__module__,
28                    **(attributes or {})
29                }
30            )
31            try:
32                result = await func(*args, **kwargs)
33                span.set_status(SpanStatus.OK)
34                return result
35            except Exception as e:
36                span.set_status(SpanStatus.ERROR, str(e))
37                span.add_event("exception", type=type(e).__name__, message=str(e))
38                raise
39            finally:
40                tracer.end_span(span)
41
42        @wraps(func)
43        def sync_wrapper(*args, **kwargs):
44            tracer = get_tracer()
45            span = tracer.start_span(
46                span_name,
47                **{
48                    "function.name": func.__name__,
49                    "function.module": func.__module__,
50                    **(attributes or {})
51                }
52            )
53            try:
54                result = func(*args, **kwargs)
55                span.set_status(SpanStatus.OK)
56                return result
57            except Exception as e:
58                span.set_status(SpanStatus.ERROR, str(e))
59                span.add_event("exception", type=type(e).__name__, message=str(e))
60                raise
61            finally:
62                tracer.end_span(span)
63
64        import asyncio
65        if asyncio.iscoroutinefunction(func):
66            return async_wrapper
67        return sync_wrapper
68
69    return decorator
70
71
72@contextmanager
73def trace_span(
74    name: str,
75    **attributes
76) -> Generator[Span, None, None]:
77    """Context manager for creating spans."""
78    tracer = get_tracer()
79    span = tracer.start_span(name, **attributes)
80    try:
81        yield span
82    except Exception as e:
83        span.set_status(SpanStatus.ERROR, str(e))
84        span.add_event("exception", type=type(e).__name__, message=str(e))
85        raise
86    finally:
87        span.set_status(SpanStatus.OK if span.status == SpanStatus.UNSET else span.status)
88        tracer.end_span(span)
89
90
91# Usage examples
92@traced("llm_call", attributes={"model": "gpt-4"})
93async def call_llm(prompt: str) -> str:
94    """LLM call with automatic tracing."""
95    # LLM call implementation
96    return "response"
97
98
99@traced("tool_execution")
100def execute_tool(tool_name: str, params: dict) -> dict:
101    """Tool execution with automatic tracing."""
102    # Tool execution implementation
103    return {"result": "success"}
104
105
106async def process_request(request: str):
107    """Example of manual span creation."""
108    with trace_span("process_request", request_length=len(request)) as span:
109        span.add_event("request_received")
110
111        # Add custom attributes
112        span.set_attribute("request.type", "query")
113
114        with trace_span("validate_input") as validation_span:
115            is_valid = validate(request)
116            validation_span.set_attribute("valid", is_valid)
117
118        with trace_span("generate_response") as response_span:
119            response = await generate(request)
120            response_span.set_attribute("response.length", len(response))
121
122        return response

Agent-Specific Tracing

Tracing Agent Components

🐍python
1"""
2Agent-Specific Tracing
3
4Trace patterns for common agent operations.
5"""
6
7from dataclasses import dataclass
8from typing import Any
9
10
11class AgentTracer:
12    """Tracer specialized for agent operations."""
13
14    def __init__(self, base_tracer: Tracer):
15        self.tracer = base_tracer
16
17    @contextmanager
18    def trace_agent_run(self, task: str, agent_config: dict):
19        """Trace an entire agent run."""
20        with trace_span(
21            "agent_run",
22            task_preview=task[:100],
23            agent_type=agent_config.get("type"),
24            model=agent_config.get("model")
25        ) as span:
26            span.add_event("agent_started", config=agent_config)
27            yield span
28            span.add_event("agent_completed")
29
30    @contextmanager
31    def trace_llm_call(
32        self,
33        model: str,
34        prompt_tokens: int,
35        temperature: float
36    ):
37        """Trace an LLM API call."""
38        with trace_span(
39            "llm_call",
40            **{
41                "llm.model": model,
42                "llm.prompt_tokens": prompt_tokens,
43                "llm.temperature": temperature
44            }
45        ) as span:
46            start = time.time()
47            yield span
48            span.set_attribute("llm.duration_ms", (time.time() - start) * 1000)
49
50    @contextmanager
51    def trace_tool_call(
52        self,
53        tool_name: str,
54        tool_input: dict
55    ):
56        """Trace a tool invocation."""
57        with trace_span(
58            f"tool:{tool_name}",
59            **{
60                "tool.name": tool_name,
61                "tool.input_size": len(str(tool_input))
62            }
63        ) as span:
64            span.add_event("tool_invoked", input_keys=list(tool_input.keys()))
65            yield span
66
67    @contextmanager
68    def trace_memory_operation(
69        self,
70        operation: str,
71        memory_type: str
72    ):
73        """Trace a memory operation."""
74        with trace_span(
75            f"memory:{operation}",
76            **{
77                "memory.operation": operation,
78                "memory.type": memory_type
79            }
80        ) as span:
81            yield span
82
83    @contextmanager
84    def trace_decision(
85        self,
86        decision_type: str,
87        options: list[str]
88    ):
89        """Trace an agent decision."""
90        with trace_span(
91            f"decision:{decision_type}",
92            **{
93                "decision.type": decision_type,
94                "decision.option_count": len(options)
95            }
96        ) as span:
97            yield span
98
99
100class TracedAgentExecutor:
101    """Agent executor with comprehensive tracing."""
102
103    def __init__(self, agent_tracer: AgentTracer):
104        self.tracer = agent_tracer
105        self.iteration_count = 0
106
107    async def run(self, task: str, config: dict) -> dict:
108        """Run agent with full tracing."""
109        with self.tracer.trace_agent_run(task, config) as root_span:
110            root_span.set_attribute("task.length", len(task))
111
112            iterations = []
113            max_iterations = config.get("max_iterations", 10)
114
115            while self.iteration_count < max_iterations:
116                self.iteration_count += 1
117
118                with trace_span(
119                    f"iteration_{self.iteration_count}",
120                    iteration=self.iteration_count
121                ) as iter_span:
122
123                    # Think
124                    with self.tracer.trace_llm_call(
125                        model=config["model"],
126                        prompt_tokens=1000,
127                        temperature=0.7
128                    ) as llm_span:
129                        thought = await self._think(task)
130                        llm_span.set_attribute("llm.completion_tokens", 200)
131
132                    # Decide
133                    with self.tracer.trace_decision(
134                        "action_selection",
135                        options=["search", "analyze", "respond"]
136                    ) as decision_span:
137                        action = await self._decide(thought)
138                        decision_span.set_attribute("decision.chosen", action["type"])
139
140                    # Act
141                    if action["type"] == "respond":
142                        iter_span.add_event("task_complete")
143                        break
144
145                    with self.tracer.trace_tool_call(
146                        action["type"],
147                        action.get("input", {})
148                    ) as tool_span:
149                        result = await self._act(action)
150                        tool_span.set_attribute("tool.success", result.get("success"))
151
152                    iterations.append({
153                        "thought": thought,
154                        "action": action,
155                        "result": result
156                    })
157
158            root_span.set_attribute("total_iterations", self.iteration_count)
159            return {"iterations": iterations, "task": task}
160
161    async def _think(self, context: str) -> str:
162        return "Thinking about the task..."
163
164    async def _decide(self, thought: str) -> dict:
165        return {"type": "search", "input": {"query": "test"}}
166
167    async def _act(self, action: dict) -> dict:
168        return {"success": True, "data": "result"}

Trace Visualization

Rendering Traces

🐍python
1"""
2Trace Visualization
3
4Generate visual representations of traces.
5"""
6
7from dataclasses import dataclass
8
9
10class TraceRenderer:
11    """Render traces for visualization."""
12
13    def to_timeline(self, trace: Trace) -> str:
14        """Render trace as ASCII timeline."""
15        if not trace.spans:
16            return "Empty trace"
17
18        lines = []
19        root = trace.get_root_span()
20        if not root:
21            return "No root span"
22
23        # Find time bounds
24        min_time = min(s.start_time for s in trace.spans)
25        max_time = max(s.end_time or s.start_time for s in trace.spans)
26        total_duration = (max_time - min_time).total_seconds() * 1000
27
28        def render_span(span: Span, depth: int = 0):
29            indent = "  " * depth
30            start_offset = (span.start_time - min_time).total_seconds() * 1000
31            duration = span.duration_ms or 0
32
33            # Calculate bar position
34            bar_width = 50
35            start_pos = int((start_offset / total_duration) * bar_width) if total_duration > 0 else 0
36            end_pos = int(((start_offset + duration) / total_duration) * bar_width) if total_duration > 0 else 0
37            bar_length = max(1, end_pos - start_pos)
38
39            bar = " " * start_pos + "=" * bar_length
40            status_icon = "+" if span.status == SpanStatus.OK else "x" if span.status == SpanStatus.ERROR else "?"
41
42            lines.append(
43                f"{indent}{status_icon} {span.name} [{duration:.1f}ms]"
44            )
45            lines.append(f"{indent}  |{bar}|")
46
47            # Render children
48            children = [s for s in trace.spans if s.parent_span_id == span.span_id]
49            for child in sorted(children, key=lambda s: s.start_time):
50                render_span(child, depth + 1)
51
52        render_span(root)
53        return "\n".join(lines)
54
55    def to_json(self, trace: Trace) -> dict:
56        """Export trace as JSON for visualization tools."""
57        return {
58            "trace_id": trace.trace_id,
59            "spans": [
60                {
61                    "span_id": s.span_id,
62                    "parent_span_id": s.parent_span_id,
63                    "name": s.name,
64                    "start_time": s.start_time.isoformat(),
65                    "end_time": s.end_time.isoformat() if s.end_time else None,
66                    "duration_ms": s.duration_ms,
67                    "status": s.status.value,
68                    "attributes": s.attributes,
69                    "events": [
70                        {
71                            "name": e.name,
72                            "timestamp": e.timestamp.isoformat(),
73                            "attributes": e.attributes
74                        }
75                        for e in s.events
76                    ]
77                }
78                for s in trace.spans
79            ]
80        }
81
82    def to_jaeger_format(self, trace: Trace) -> dict:
83        """Export in Jaeger-compatible format."""
84        processes = {
85            "p1": {
86                "serviceName": trace.spans[0].attributes.get("service.name", "unknown")
87                if trace.spans else "unknown"
88            }
89        }
90
91        spans = []
92        for s in trace.spans:
93            spans.append({
94                "traceID": trace.trace_id,
95                "spanID": s.span_id,
96                "operationName": s.name,
97                "references": [
98                    {
99                        "refType": "CHILD_OF",
100                        "traceID": trace.trace_id,
101                        "spanID": s.parent_span_id
102                    }
103                ] if s.parent_span_id else [],
104                "startTime": int(s.start_time.timestamp() * 1_000_000),
105                "duration": int(s.duration_ms * 1000) if s.duration_ms else 0,
106                "tags": [
107                    {"key": k, "value": v, "type": "string"}
108                    for k, v in s.attributes.items()
109                ],
110                "logs": [
111                    {
112                        "timestamp": int(e.timestamp.timestamp() * 1_000_000),
113                        "fields": [
114                            {"key": "event", "value": e.name, "type": "string"},
115                            *[
116                                {"key": k, "value": str(v), "type": "string"}
117                                for k, v in e.attributes.items()
118                            ]
119                        ]
120                    }
121                    for e in s.events
122                ],
123                "processID": "p1"
124            })
125
126        return {
127            "data": [{
128                "traceID": trace.trace_id,
129                "spans": spans,
130                "processes": processes
131            }]
132        }
133
134
135# Example usage
136renderer = TraceRenderer()
137
138# Create a sample trace
139tracer = Tracer("agent_service")
140tracer.start_trace("process_query")
141
142with trace_span("validate") as s:
143    s.add_event("validation_passed")
144
145with trace_span("llm_call") as s:
146    s.set_attribute("model", "gpt-4")
147    with trace_span("generate") as inner:
148        inner.set_attribute("tokens", 500)
149
150tracer.end_span()
151
152# Render
153print(renderer.to_timeline(tracer.current_trace))

Key Takeaways

  • Traces provide end-to-end visibility into agent request flows across all components.
  • Spans represent individual operations with timing, status, and custom attributes.
  • Use decorators and context managers for clean, consistent span creation.
  • Agent-specific tracing patterns capture LLM calls, tool usage, memory operations, and decisions.
  • Visualization tools help understand complex agent behavior and identify bottlenecks.
Next Section Preview: We'll explore debugging techniques for understanding and fixing agent behavior issues.