Introduction
Conversations are the primary interface for AI agents, but managing conversation memory is uniquely challenging. Unlike static documents, conversations are dynamic, reference-heavy, and context-dependent. A user might say "do that again" or "the second option" — understanding these requires tracking not just what was said, but the flow and structure of the dialogue.
The Conversational Challenge: Users expect agents to remember what was just discussed, understand references to earlier points, and maintain coherent context across a potentially hours-long conversation.
Context Window Challenges
Even with large context windows, conversation memory needs careful management:
The Context Budget
CONTEXT WINDOW ALLOCATION (Example: 200K tokens)

+--------------------------------------------+----------------+
| SYSTEM PROMPT (fixed)                      | ~2K tokens     |
+--------------------------------------------+----------------+
| RETRIEVED MEMORIES                         | ~5K tokens     |
+--------------------------------------------+----------------+
| CONVERSATION HISTORY                       | Variable       |
|                                            | (most space)   |
+--------------------------------------------+----------------+
| CURRENT TOOL RESULTS                       | Variable       |
+--------------------------------------------+----------------+
| RESERVED FOR RESPONSE                      | ~4K tokens     |
+--------------------------------------------+----------------+

CHALLENGE:
- Long conversations can exceed the budget
- Each turn adds messages from both user and assistant
- Tool results can be large and verbose
- Need strategy for when context fills up

When Context Overflows
class ConversationBuffer:
    """Manage conversation within context limits."""

    def __init__(
        self,
        max_tokens: int = 150000,  # Leave room for response
        token_counter=None
    ):
        self.max_tokens = max_tokens
        self.messages: list[dict] = []
        # Pluggable counter; default is a cheap chars/4 heuristic.
        self.token_counter = token_counter or self._estimate_tokens

    def add_message(self, role: str, content: str) -> None:
        """Add message, handling overflow if needed."""
        self.messages.append({"role": role, "content": content})
        self._handle_overflow()

    def _handle_overflow(self) -> None:
        """Evict oldest non-system messages until within the token budget.

        BUG FIX: if nothing is evictable (e.g. every remaining message is
        a system message), stop instead of spinning forever.
        """
        while self._total_tokens() > self.max_tokens:
            if len(self.messages) <= 2:
                # Keep at least user message and one response
                break

            # Strategy: Remove oldest non-system message
            removed = False
            for i, msg in enumerate(self.messages):
                if msg["role"] != "system":
                    # Optionally summarize before removing
                    self._archive_message(self.messages.pop(i))
                    removed = True
                    break
            if not removed:
                # No non-system message left to evict — bail out to
                # avoid an infinite loop when still over budget.
                break

    def _total_tokens(self) -> int:
        """Total estimated tokens across all buffered messages."""
        return sum(
            self.token_counter(msg["content"])
            for msg in self.messages
        )

    def _estimate_tokens(self, text: str) -> int:
        # Rough approximation: ~4 characters per token.
        return len(text) // 4

    def _archive_message(self, message: dict) -> None:
        """Archive removed message for potential later use."""
        # Could store in long-term memory, log, etc.
        pass

    def get_messages(self) -> list[dict]:
        """Return a shallow copy so callers cannot mutate the buffer."""
        return self.messages.copy()

# Conversation Summarization
Summarization compresses old conversation into fewer tokens while preserving key information:
Rolling Summarization
class RollingSummarizer:
    """Maintain a rolling summary of conversation."""

    def __init__(self, llm, summary_interval: int = 10):
        self.llm = llm
        self.summary_interval = summary_interval
        self.current_summary: str = ""
        self.recent_messages: list[dict] = []
        self.turn_count: int = 0

    async def add_turn(
        self,
        user_message: str,
        assistant_response: str
    ) -> None:
        """Record one user/assistant exchange, summarizing periodically."""
        self.recent_messages.extend([
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": assistant_response},
        ])
        self.turn_count += 1

        # Fold older turns into the summary every `summary_interval` turns.
        if self.turn_count % self.summary_interval == 0:
            await self._update_summary()

    async def _update_summary(self) -> None:
        """Ask the LLM to merge the recent turns into the rolling summary."""
        recent_text = self._format_messages(self.recent_messages)

        prompt = f"""Update this conversation summary with new turns.

CURRENT SUMMARY:
{self.current_summary or "No previous summary."}

NEW CONVERSATION TURNS:
{recent_text}

UPDATED SUMMARY (preserve key facts, decisions, and context):"""

        self.current_summary = await self.llm.generate(prompt)

        # Retain only the last two exchanges (4 messages) verbatim.
        self.recent_messages = self.recent_messages[-4:]

    def get_context(self) -> str:
        """Build the context string for the next LLM call."""
        recent_text = self._format_messages(self.recent_messages)
        if not self.current_summary:
            return recent_text
        return f"""CONVERSATION SUMMARY:
{self.current_summary}

RECENT MESSAGES:
{recent_text}"""

    def _format_messages(self, messages: list[dict]) -> str:
        """Render messages as 'ROLE: content' lines."""
        return "\n".join(
            f"{entry['role'].upper()}: {entry['content']}"
            for entry in messages
        )

# Hierarchical Summarization
class HierarchicalSummarizer:
    """Multi-level summarization for very long conversations."""

    def __init__(self, llm):
        self.llm = llm
        # Each level compresses the one below it.
        self.levels = {
            "turn": [],      # Individual turns
            "segment": [],   # Groups of turns (5-10)
            "session": [],   # Full session summaries
            "overall": ""    # Overall context
        }

    async def add_turn(self, user: str, assistant: str) -> None:
        """Summarize one exchange, then cascade consolidation upward."""
        turn_summary = await self._summarize_turn(user, assistant)
        self.levels["turn"].append(turn_summary)

        # Consolidate turns into segments
        if len(self.levels["turn"]) >= 5:
            segment = await self._summarize_segment(self.levels["turn"])
            self.levels["segment"].append(segment)
            self.levels["turn"] = []

        # Consolidate segments into session
        if len(self.levels["segment"]) >= 3:
            session = await self._summarize_session(self.levels["segment"])
            self.levels["session"].append(session)
            self.levels["segment"] = []

        # Update overall summary
        if len(self.levels["session"]) >= 2:
            self.levels["overall"] = await self._summarize_overall(
                self.levels["session"]
            )
            self.levels["session"] = self.levels["session"][-1:]

    async def _summarize_turn(self, user: str, assistant: str) -> str:
        """Compress a single exchange to 1-2 sentences."""
        prompt = f"""Summarize this exchange in 1-2 sentences:
USER: {user}
ASSISTANT: {assistant}"""
        return await self.llm.generate(prompt)

    async def _summarize_segment(self, turns: list[str]) -> str:
        """Compress a batch of turn summaries into one paragraph."""
        prompt = f"""Summarize these conversation turns into a paragraph:
{chr(10).join(turns)}"""
        return await self.llm.generate(prompt)

    async def _summarize_session(self, segments: list[str]) -> str:
        """Compress segment summaries into a session summary.

        BUG FIX: add_turn called this method but it was never defined.
        """
        prompt = f"""Summarize these conversation segments into a short session summary:
{chr(10).join(segments)}"""
        return await self.llm.generate(prompt)

    async def _summarize_overall(self, sessions: list[str]) -> str:
        """Merge session summaries into the overall context.

        BUG FIX: add_turn called this method but it was never defined.
        """
        prompt = f"""Combine these session summaries into one overall context summary:
{chr(10).join(sessions)}"""
        return await self.llm.generate(prompt)

    def get_context(self) -> str:
        """Build context from all levels, most general first."""
        parts = []

        if self.levels["overall"]:
            parts.append(f"OVERALL CONTEXT:\n{self.levels['overall']}")

        if self.levels["session"]:
            parts.append(f"RECENT SESSION:\n{self.levels['session'][-1]}")

        if self.levels["segment"]:
            parts.append(f"RECENT SEGMENT:\n{chr(10).join(self.levels['segment'])}")

        if self.levels["turn"]:
            parts.append(f"RECENT TURNS:\n{chr(10).join(self.levels['turn'])}")

        return "\n\n---\n\n".join(parts)

# Sliding Window Strategies
Different ways to manage which messages stay in context:
Token-Based Window
class TokenSlidingWindow:
    """Keep messages within token budget."""

    def __init__(self, max_tokens: int = 50000):
        self.max_tokens = max_tokens
        self.messages: list[dict] = []

    def add(self, role: str, content: str) -> list[dict]:
        """Add message and return messages that fit in window."""
        entry = {
            "role": role,
            "content": content,
            "tokens": len(content) // 4,  # rough chars-per-token estimate
        }
        self.messages.append(entry)
        return self._get_window()

    def _get_window(self) -> list[dict]:
        """Get messages that fit within the token limit, newest first pick."""
        budget = self.max_tokens
        picked = []

        # Walk backwards so the most recent messages are kept.
        for entry in reversed(self.messages):
            if entry["tokens"] > budget:
                break
            picked.append({"role": entry["role"], "content": entry["content"]})
            budget -= entry["tokens"]

        # Restore chronological order for the caller.
        picked.reverse()
        return picked

# Importance-Based Window
class ImportanceWindow:
    """Keep important messages even when old."""

    def __init__(self, max_tokens: int, llm):
        self.max_tokens = max_tokens
        self.llm = llm
        self.messages: list[dict] = []

    async def add(self, role: str, content: str) -> None:
        """Score a message's importance and store it with metadata."""
        importance = await self._assess_importance(content)

        self.messages.append({
            "role": role,
            "content": content,
            "tokens": len(content) // 4,  # rough chars-per-token estimate
            "importance": importance,
            "timestamp": datetime.now()
        })

    def _total_tokens(self) -> int:
        """Total estimated tokens across stored messages.

        BUG FIX: get_window called this method but it was never defined.
        """
        return sum(m["tokens"] for m in self.messages)

    async def get_window(self) -> list[dict]:
        """Get messages balancing recency and importance."""
        if self._total_tokens() <= self.max_tokens:
            return [{"role": m["role"], "content": m["content"]}
                    for m in self.messages]

        # Score = importance * recency_weight
        now = datetime.now()
        scored = []

        for i, msg in enumerate(self.messages):
            # BUG FIX: timedelta.seconds wraps every 24h and ignores days;
            # total_seconds() is the real elapsed time.
            age_hours = (now - msg["timestamp"]).total_seconds() / 3600
            recency_weight = 1 / (1 + age_hours)  # Decay with age
            score = msg["importance"] * recency_weight

            # Always keep most recent messages
            if i >= len(self.messages) - 4:
                score += 100

            scored.append((msg, score))

        # Sort by score and take what fits
        scored.sort(key=lambda x: x[1], reverse=True)

        window = []
        total_tokens = 0

        for msg, score in scored:
            if total_tokens + msg["tokens"] > self.max_tokens:
                break
            window.append(msg)
            total_tokens += msg["tokens"]

        # Re-sort by timestamp for coherent conversation
        window.sort(key=lambda x: x["timestamp"])

        return [{"role": m["role"], "content": m["content"]} for m in window]

    async def _assess_importance(self, content: str) -> float:
        """Ask the LLM for a 0-1 importance score; 0.5 on unparseable output."""
        prompt = f"""Rate the importance of remembering this message (0-1).
High importance: decisions, preferences, key facts, instructions
Low importance: greetings, acknowledgments, filler

Message: {content}

Return just a number between 0 and 1."""

        response = await self.llm.generate(prompt)
        try:
            # BUG FIX: narrow the bare except to parse failures, and clamp
            # to the promised [0, 1] range in case the LLM overshoots.
            return min(1.0, max(0.0, float(response.strip())))
        except (ValueError, TypeError, AttributeError):
            return 0.5

# Entity and Reference Tracking
Conversations are full of references that require tracking context:
Entity Extraction and Tracking
1from dataclasses import dataclass, field
2from typing import Optional
3
@dataclass
class Entity:
    """A tracked conversational entity (person, place, thing, or concept)."""

    name: str
    type: str  # person, place, thing, concept
    # Alternate names this entity has been referred to by.
    aliases: set[str] = field(default_factory=set)
    # Arbitrary key/value attributes learned about the entity.
    properties: dict = field(default_factory=dict)
    # Turn number when the entity was last referenced.
    last_mentioned: int = 0
class EntityTracker:
    """Track entities mentioned in conversation."""

    def __init__(self, llm):
        self.llm = llm
        # Keyed by lowercased canonical name.
        self.entities: dict[str, Entity] = {}
        self.current_turn = 0

    async def process_message(self, content: str) -> None:
        """Extract and track entities from message."""
        self.current_turn += 1

        # Extract entities
        entities = await self._extract_entities(content)

        for entity_data in entities:
            name = entity_data["name"].lower()

            if name in self.entities:
                # Update existing entity
                entity = self.entities[name]
                entity.last_mentioned = self.current_turn
                if "properties" in entity_data:
                    entity.properties.update(entity_data["properties"])
            else:
                # Create new entity
                self.entities[name] = Entity(
                    name=entity_data["name"],
                    type=entity_data["type"],
                    aliases=set(entity_data.get("aliases", [])),
                    properties=entity_data.get("properties", {}),
                    last_mentioned=self.current_turn
                )

    async def resolve_reference(self, reference: str) -> Optional[Entity]:
        """Resolve a pronoun or reference to an entity, or None."""
        ref_lower = reference.lower()

        # Direct match
        if ref_lower in self.entities:
            return self.entities[ref_lower]

        # Check aliases
        for entity in self.entities.values():
            if ref_lower in entity.aliases:
                return entity

        # Use LLM for ambiguous references
        if reference in ["it", "that", "this", "they", "them"]:
            # Get recently mentioned entities
            recent = sorted(
                self.entities.values(),
                key=lambda e: e.last_mentioned,
                reverse=True
            )[:5]

            if recent:
                prompt = f"""Given these recently mentioned entities:
{[e.name for e in recent]}

What does "{reference}" most likely refer to?
Return just the entity name or "unknown"."""

                result = await self.llm.generate(prompt)
                result_lower = result.strip().lower()

                if result_lower in self.entities:
                    return self.entities[result_lower]
                # BUG FIX: the LLM may answer with an alias rather than the
                # canonical name — check aliases before giving up.
                for entity in self.entities.values():
                    if result_lower in entity.aliases:
                        return entity

        return None

    async def _extract_entities(self, content: str) -> list[dict]:
        """Ask the LLM for a JSON array of entity descriptions."""
        import json  # local import: json is not imported at file level

        prompt = f"""Extract entities from this text.
Return JSON array: [{{"name": "...", "type": "person|place|thing|concept",
  "aliases": [...], "properties": {{...}}}}]

Text: {content}"""

        response = await self.llm.generate(prompt)
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # BUG FIX: bare except hid real errors; narrow to parse failures.
            return []

    def get_context_summary(self) -> str:
        """Get summary of tracked entities for context."""
        if not self.entities:
            return ""

        lines = ["KNOWN ENTITIES:"]
        for entity in self.entities.values():
            props = ", ".join(f"{k}={v}" for k, v in entity.properties.items())
            lines.append(f"- {entity.name} ({entity.type}): {props}")

        return "\n".join(lines)

# Multi-Session Continuity
Maintaining context across multiple conversation sessions:
class SessionManager:
    """Manage continuity across conversation sessions."""

    def __init__(
        self,
        session_store,  # Database for session data
        llm,
        max_sessions_to_recall: int = 3
    ):
        self.store = session_store
        self.llm = llm
        self.max_sessions = max_sessions_to_recall
        # BUG FIX: annotated as plain str but initialized to None.
        self.current_session_id: str | None = None

    async def start_session(self, user_id: str) -> str:
        """Start a new session, loading relevant history.

        Returns a continuity-context string built from past sessions
        (empty string when the user has none).
        """
        self.current_session_id = self._generate_session_id()

        # Load previous sessions
        past_sessions = await self.store.get_user_sessions(
            user_id,
            limit=self.max_sessions
        )

        # Build continuity context
        continuity = await self._build_continuity_context(past_sessions)

        # Save session start
        await self.store.create_session(
            session_id=self.current_session_id,
            user_id=user_id,
            started_at=datetime.now()
        )

        return continuity

    def _generate_session_id(self) -> str:
        """Return a unique session identifier.

        BUG FIX: start_session called this method but it was never defined.
        """
        import uuid  # local import: uuid is not imported at file level
        return uuid.uuid4().hex

    async def end_session(
        self,
        user_id: str,
        conversation: list[dict]
    ) -> None:
        """End session and save summary for future continuity."""

        # Generate session summary
        summary = await self._summarize_session(conversation)

        # Extract key facts and decisions
        facts = await self._extract_session_facts(conversation)

        # Save session data
        await self.store.update_session(
            session_id=self.current_session_id,
            ended_at=datetime.now(),
            summary=summary,
            facts=facts,
            message_count=len(conversation)
        )

    async def _build_continuity_context(
        self,
        sessions: list[dict]
    ) -> str:
        """Render past-session summaries and facts into a context block."""
        if not sessions:
            return ""

        parts = ["PREVIOUS SESSION CONTEXT:"]

        for session in sessions:
            date_str = session["started_at"].strftime("%Y-%m-%d")
            parts.append(f"\n[Session from {date_str}]")
            parts.append(session["summary"])

            if session.get("facts"):
                parts.append("Key points:")
                for fact in session["facts"][:5]:
                    parts.append(f" - {fact}")

        return "\n".join(parts)

    async def _summarize_session(self, conversation: list[dict]) -> str:
        """Summarize the session's last 20 messages in 2-3 sentences."""
        conv_text = "\n".join([
            f"{m['role'].upper()}: {m['content'][:500]}"
            for m in conversation[-20:]  # Last 20 messages
        ])

        prompt = f"""Summarize this conversation session in 2-3 sentences.
Focus on: main topics, outcomes, any commitments or next steps.

{conv_text}"""

        return await self.llm.generate(prompt)

    async def _extract_session_facts(
        self,
        conversation: list[dict]
    ) -> list[str]:
        """Pull durable facts (preferences, decisions, commitments) as a list."""
        import json  # local import: json is not imported at file level

        conv_text = "\n".join([
            f"{m['role'].upper()}: {m['content'][:300]}"
            for m in conversation
        ])

        prompt = f"""Extract key facts from this conversation that should
be remembered for future sessions. Return as JSON array of strings.

Only include:
- User preferences stated
- Decisions made
- Important context about ongoing work
- Commitments or promises

{conv_text}"""

        response = await self.llm.generate(prompt)
        try:
            return json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # BUG FIX: bare except hid real errors; narrow to parse failures.
            return []

# Session Greeting
Summary
Key strategies for conversation memory:
- Budget context: Allocate tokens between system prompt, history, tools, and response
- Summarize progressively: Rolling or hierarchical summarization as conversation grows
- Sliding windows: Keep recent messages, optionally weighted by importance
- Track entities: Maintain knowledge of people, places, and things mentioned
- Resolve references: Connect pronouns and references to specific entities
- Cross-session continuity: Save summaries and facts for future sessions
Next: We'll explore knowledge graphs — structured memory that captures relationships between concepts.